From b03f11dfde4566ffeed2b473c3d6e8bd8aea557f Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 17 May 2022 09:48:04 -0700
Subject: [PATCH 01/59] [Hexagon]Use requires_hexagon instead of
 requires_hexagon_toolchain if running on hexagon target (#11294)

* refactor requires_hexagon_toolchain

* trigger

* lint
---
 python/tvm/testing/utils.py                      |  4 ++++
 .../contrib/test_hexagon/benchmark_hexagon.py    | 12 +++---------
 .../test_hexagon/test_2d_physical_buffers.py     | 10 +++++++---
 .../python/contrib/test_hexagon/test_launcher.py | 16 +++++++---------
 tests/python/contrib/test_hexagon/test_models.py |  6 ++----
 .../contrib/test_hexagon/test_run_unit_tests.py  |  9 +++++----
 .../contrib/test_hexagon/test_thread_pool.py     | 13 +++----------
 .../test_hexagon/topi/test_batch_matmul.py       |  6 +++---
 .../test_hexagon/topi/test_cache_read_write.py   |  7 +++----
 .../test_hexagon/topi/test_conv2d_nchw.py        |  4 +---
 .../test_hexagon/topi/test_conv2d_nhwc.py        |  5 +----
 .../test_hexagon/topi/test_conv2d_transpose.py   |  4 +---
 .../contrib/test_hexagon/topi/test_dense.py      |  4 +---
 .../test_hexagon/topi/test_depthwise_conv2d.py   |  3 +--
 .../contrib/test_hexagon/topi/test_pooling.py    | 16 +++++++---------
 .../contrib/test_hexagon/topi/test_reduce.py     |  4 +---
 .../contrib/test_hexagon/topi/test_softmax.py    |  4 +---
 17 files changed, 51 insertions(+), 76 deletions(-)

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index b86596feed6b7..8be5cc8ec4715 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -404,6 +404,10 @@ def _get_targets(target_str=None):
         if target_kind == "cuda" and "cudnn" in tvm.target.Target(target).attrs.get("libs", []):
             is_enabled = tvm.support.libinfo()["USE_CUDNN"].lower() in ["on", "true", "1"]
             is_runnable = is_enabled and cudnn.exists()
+        elif target_kind == "hexagon":
+            is_enabled = tvm.support.libinfo()["USE_HEXAGON"].lower() in ["on", "true", "1"]
+            # Hexagon is runnable only when ANDROID_SERIAL_NUMBER is set in the environment
+            is_runnable = is_enabled and "ANDROID_SERIAL_NUMBER" in os.environ
         else:
             is_enabled = tvm.runtime.enabled(target_kind)
             is_runnable = is_enabled and tvm.device(target_kind).exist
diff --git a/tests/python/contrib/test_hexagon/benchmark_hexagon.py b/tests/python/contrib/test_hexagon/benchmark_hexagon.py
index f17530c3efdca..979bd111707b0 100644
--- a/tests/python/contrib/test_hexagon/benchmark_hexagon.py
+++ b/tests/python/contrib/test_hexagon/benchmark_hexagon.py
@@ -27,13 +27,7 @@
 
 import tvm.testing
 from tvm import te
-from tvm import relay
-from tvm.relay.backend import Executor, Runtime
-from tvm.contrib import utils, ndk
-from tvm.contrib.hexagon.build import HexagonLauncher
-import tvm.contrib.hexagon as hexagon
-
-from .conftest import requires_hexagon_toolchain
+from tvm.contrib.hexagon.build import HexagonLauncherRPC
 
 RPC_SERVER_PORT = 7070
 
@@ -47,8 +41,8 @@
 # server to bind to the same port until the wait time elapses.
 
 
-@requires_hexagon_toolchain
-def test_elemwise_add(android_serial_number, hexagon_launcher):
+@tvm.testing.requires_hexagon
+def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):
     """
     Starting with an elementwise-add computation, try various schedules / optimizations to
     see the impact they have on performance.
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
index 9de55996b031e..78e1eb11ad9fd 100644
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -19,8 +19,6 @@
 
 import contextlib
 import sys
-import tempfile
-import pathlib
 
 import pytest
 import numpy as np
@@ -272,6 +270,12 @@ def test_lower(self, schedule_args):
 
     @requires_hexagon_toolchain
     def test_build(self, schedule_args, target_host, input_layout, working_layout, output_layout):
+        """Testing build success/failure
+
+        * On Hexagon targets, build must succeed for both 1-d and 2-d memory.
+        * On non-Hexagon targets, build must succeed 1-d memory.
+        * On non-Hexagon targets, build must succeed for 1-d memory.
+        """
         # contextlib.nullcontext wasn't added until python3.7, and the
         # CI currently runs on python3.6.  Therefore, using ExitStack
         # to manage an optional context instead.
@@ -292,7 +296,7 @@ def runtime_module(self, schedule_args, target_host):
 
         return tvm.build(*schedule_args, target=target_host)
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_execute(
         self,
         runtime_module,
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 7dadc8f2f4ab6..5c5e8f6c39f16 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -25,10 +25,8 @@
 from tvm.relay.backend import Executor, Runtime
 from tvm.contrib.hexagon.session import Session
 
-from .conftest import requires_hexagon_toolchain
 
-
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_add(hexagon_session: Session):
     dtype = "int8"
     A = tvm.te.placeholder((2,), dtype=dtype)
@@ -53,7 +51,7 @@ def test_add(hexagon_session: Session):
     assert (C_data.numpy() == np.array([6, 7])).all()
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_add_vtcm(hexagon_session: Session):
     dtype = "int8"
     A = tvm.te.placeholder((2,), dtype=dtype)
@@ -87,7 +85,7 @@ class TestMatMul:
     N = tvm.testing.parameter(32)
     K = tvm.testing.parameter(32)
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_matmul(self, hexagon_session, M, N, K):
         X = te.placeholder((M, K), dtype="float32")
         Y = te.placeholder((K, N), dtype="float32")
@@ -122,7 +120,7 @@ def test_matmul(self, hexagon_session, M, N, K):
         tvm.testing.assert_allclose(zt.numpy(), ztcpu.numpy(), rtol=1e-4)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_graph_executor(hexagon_session: Session):
     dtype = "float32"
     data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
@@ -178,7 +176,7 @@ def test_graph_executor(hexagon_session: Session):
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_graph_executor_multiple_conv2d(hexagon_session: Session):
     dtype = "float32"
     input_shape = (1, 8, 8, 3)
@@ -255,7 +253,7 @@ def test_graph_executor_multiple_conv2d(hexagon_session: Session):
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_aot_executor(hexagon_session: Session, aot_host_target, aot_target):
     dtype = "float32"
     input_shape = (1, 128, 128, 3)
@@ -314,7 +312,7 @@ def test_aot_executor(hexagon_session: Session, aot_host_target, aot_target):
     tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_aot_executor_multiple_conv2d(hexagon_session: Session, aot_host_target, aot_target):
     dtype = "float32"
     input_shape = (1, 8, 8, 3)
diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py
index 649cc5b3f4dda..74f52f20d97c0 100644
--- a/tests/python/contrib/test_hexagon/test_models.py
+++ b/tests/python/contrib/test_hexagon/test_models.py
@@ -24,8 +24,6 @@
 from tvm.relay.backend import Executor, Runtime
 from tvm.contrib.hexagon.session import Session
 
-from .conftest import requires_hexagon_toolchain
-
 
 def get_mobilenet():
     """Download and import mobilenet model with ONNX"""
@@ -38,7 +36,7 @@ def get_mobilenet():
     return onnx.load(model_path)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_mobilenet(hexagon_session: Session):
     dtype = "float32"
     onnx_model = get_mobilenet()
@@ -88,7 +86,7 @@ def test_mobilenet(hexagon_session: Session):
 enable_usmp = tvm.testing.parameter(False, True)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_mobilenet_aot(hexagon_session: Session, aot_host_target, aot_target, enable_usmp):
     if hexagon_session._launcher._serial_number == "simulator":
         pytest.skip(msg="Skip on simulator due to long runtime.")
diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index 3a383d30e5f4b..010c79b8f5544 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -18,20 +18,21 @@
 import os
 import pytest
 import numpy as np
-from tvm.contrib.hexagon.build import HexagonLauncher
-from .conftest import requires_hexagon_toolchain
+
+import tvm
+from tvm.contrib.hexagon.session import Session
 
 
 # use pytest -sv to observe gtest output
 # use --gtest_args to pass arguments to gtest
 # for example to run all "foo" tests twice and observe gtest output run
 # pytest -sv <this file> --gtests_args="--gtest_filter=*foo* --gtest_repeat=2"
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 @pytest.mark.skipif(
     os.environ.get("HEXAGON_GTEST") == None,
     reason="Test requires environment variable HEXAGON_GTEST set with a path to a Hexagon gtest version normally located at /path/to/hexagon/sdk/utils/googletest/gtest",
 )
-def test_run_unit_tests(hexagon_session, gtest_args):
+def test_run_unit_tests(hexagon_session: Session, gtest_args):
     try:
         func = hexagon_session._rpc.get_function("hexagon.run_unit_tests")
     except:
diff --git a/tests/python/contrib/test_hexagon/test_thread_pool.py b/tests/python/contrib/test_hexagon/test_thread_pool.py
index 8a35bff7e7c97..d95c4120b7758 100644
--- a/tests/python/contrib/test_hexagon/test_thread_pool.py
+++ b/tests/python/contrib/test_hexagon/test_thread_pool.py
@@ -25,7 +25,6 @@
 import tvm.testing
 from tvm import te
 
-from .conftest import requires_hexagon_toolchain
 from tvm.script import tir as T
 
 
@@ -67,11 +66,8 @@ def benchmark_func(mod, name, args, hexagon_session):
     return evaluator(a, b, c, n).mean
 
 
-@requires_hexagon_toolchain
-def test_speedup(hexagon_session, capsys):
-    if hexagon_session is None:
-        pytest.skip(msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.")
-
+@tvm.testing.requires_hexagon
+def test_speedup(hexagon_session: Session, capsys):
     target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
         ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
@@ -85,11 +81,8 @@ def test_speedup(hexagon_session, capsys):
         print("... speedup of {:.2f}".format(serial_mean / parallel_mean), end=" ")
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_elemwise_sum_parallel(hexagon_session: Session):
-    if hexagon_session is None:
-        pytest.skip(msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.")
-
     target_hexagon = tvm.target.hexagon("v68", link_params=True)
     func = tvm.build(
         ElemwiseSumIRModule, target=tvm.target.Target(target_hexagon, host=target_hexagon)
diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
index 2816322b6d43d..093ce37e5efaa 100644
--- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
+++ b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py
@@ -25,8 +25,8 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
+from tvm.contrib.hexagon.session import Session
 
-from ..conftest import requires_hexagon_toolchain
 
 dtype = tvm.testing.parameter(
     "float32",
@@ -46,7 +46,7 @@ class TestMatMulFloat:
     )
 
     # TODO(mehrdadh): add dynamic testing
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_batch_matmul(self, hexagon_session: Session, x_batch, y_batch, M, N, K, dtype):
         if dtype == "float16":
             pytest.xfail("float16 is not supported.")
@@ -98,7 +98,7 @@ class TestMatMulInt8:
         (5, 1, 16, 16, 32),
     )
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_batch_matmul_int8(self, hexagon_session: Session, x_batch, y_batch, M, N, K):
         dtype = "int8"
         out_dtype = "int8"
diff --git a/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py b/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py
index bfb597f7b7f38..435ab7190752c 100644
--- a/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py
+++ b/tests/python/contrib/test_hexagon/topi/test_cache_read_write.py
@@ -21,8 +21,7 @@
 
 import tvm.testing
 from tvm import te
-
-from ..conftest import requires_hexagon_toolchain
+from tvm.contrib.hexagon.session import Session
 
 
 def intrin_mem_copy(shape, dtype, dst_scope, src_scope):
@@ -98,7 +97,7 @@ def verify(hexagon_session: Session, s, x, y, z, size):
     np.testing.assert_equal(zt.numpy(), ref)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_cache_read_write(hexagon_session: Session):
     size = 128
     outer_shape = (size,)
@@ -140,7 +139,7 @@ def layout_transform_2d(n):
     return [n // 16, te.AXIS_SEPARATOR, n % 16]
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_cache_read_write_2d(hexagon_session: Session):
     size = 128
     outer_shape = (size,)
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
index b3d6832ffaa9b..7f530a5c4d809 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py
@@ -27,8 +27,6 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.utils import get_pad_tuple
 
-from ..conftest import requires_hexagon_toolchain
-
 
 dtype = tvm.testing.parameter("float32")
 random_seed = tvm.testing.parameter(0)
@@ -91,7 +89,7 @@ class BaseConv2DTests:
     dilation = tvm.testing.parameter(1)
     batch = tvm.testing.parameter(1)
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_conv2d_nchw(
         self,
         hexagon_session: Session,
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
index 30b54d51348db..74a3f8dafa3ed 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py
@@ -25,9 +25,6 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
-from tvm.topi.nn.utils import get_pad_tuple
-
-from ..conftest import requires_hexagon_toolchain
 
 dtype = tvm.testing.parameter("float32")
 
@@ -46,7 +43,7 @@ def ref_data(dtype, batch, in_channel, in_size, num_filter, kernel, stride, padd
 
 
 class BaseConv2DTests:
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_conv2d_nhwc(
         self,
         hexagon_session: Session,
diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
index 0da740614f9db..629403965eae8 100644
--- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
+++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py
@@ -22,9 +22,7 @@
 from tvm import te
 from tvm import topi
 import tvm.topi.testing
-from tvm.contrib.pickle_memoize import memoize
 from tvm.topi.utils import get_const_tuple
-from ..conftest import requires_hexagon_toolchain
 
 
 # TODO Should add kernal to tvm.testing.fixture
@@ -68,7 +66,7 @@ def shift_shape(output_padding):
 
 
 class BaseConv2DTransposeTests:
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_conv2d(
         self,
         hexagon_session: Session,
diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py
index c63873a62d96d..189b05fcaade0 100644
--- a/tests/python/contrib/test_hexagon/topi/test_dense.py
+++ b/tests/python/contrib/test_hexagon/topi/test_dense.py
@@ -26,8 +26,6 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
-from ..conftest import requires_hexagon_toolchain
-
 random_seed = tvm.testing.parameter(0)
 
 use_bias = tvm.testing.parameter(True, False)
@@ -68,7 +66,7 @@ def dense_ref_data(random_seed, batch_size, in_dim, out_dim, use_bias, in_dtype,
     return (a_np, b_np, c_np, d_np)
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_dense(
     hexagon_session: Session,
     batch_size,
diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
index ab2ce36e1f826..63ae0e7b3253b 100644
--- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
+++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py
@@ -28,7 +28,6 @@
 from tvm import te, topi
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.utils import get_pad_tuple
-from ..conftest import requires_hexagon_toolchain
 
 
 random_seed = tvm.testing.parameter(0)
@@ -155,7 +154,7 @@ class BaseDepthwiseConv2D:
     (e.g. implemented only for llvm).
     """
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_conv2d(
         self,
         hexagon_session: Session,
diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py
index 38b7f387e5c67..9ce54bf9a6ebb 100644
--- a/tests/python/contrib/test_hexagon/topi/test_pooling.py
+++ b/tests/python/contrib/test_hexagon/topi/test_pooling.py
@@ -26,8 +26,6 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
-from ..conftest import requires_hexagon_toolchain
-
 
 class TestAdaptivePool:
     dshape, out_size, pool_type, layout = tvm.testing.parameters(
@@ -57,7 +55,7 @@ class TestAdaptivePool:
         ((1, 16, 32, 32, 32), (2, 4, 4), "max", "NDHWC"),
     )
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_adaptive_pool(self, hexagon_session: Session, dshape, out_size, pool_type, layout):
         dtype = "float32"
         np_data = np.random.uniform(low=0, high=255, size=dshape).astype(dtype)
@@ -233,10 +231,10 @@ class TestPool1D:
         ([1, 31, 16], [3], [3], [3], [3, 0], "max", True, True, "NWC"),
     )
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_pool1d(
         self,
-        hexagon_session,
+        hexagon_session: Session,
         input_shape,
         kernel,
         stride,
@@ -310,10 +308,10 @@ class TestPool2D:
         ([1, 31, 31, 16], [3, 3], [3, 3], [2, 2], [3, 2, 1, 0], "max", True, True, "NHWC"),
     )
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_pool2d(
         self,
-        hexagon_session,
+        hexagon_session: Session,
         input_shape,
         kernel,
         stride,
@@ -708,10 +706,10 @@ class TestPool3D:
         ),
     )
 
-    @requires_hexagon_toolchain
+    @tvm.testing.requires_hexagon
     def test_pool3d(
         self,
-        hexagon_session,
+        hexagon_session: Session,
         input_shape,
         kernel,
         stride,
diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py
index beacb8cd18005..203a2bd31d6e1 100644
--- a/tests/python/contrib/test_hexagon/topi/test_reduce.py
+++ b/tests/python/contrib/test_hexagon/topi/test_reduce.py
@@ -25,8 +25,6 @@
 from tvm.contrib.hexagon.session import Session
 import tvm.topi.testing
 
-from ..conftest import requires_hexagon_toolchain
-
 
 in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters(
     ((32,), 0, False, "argmax", "float32"),
@@ -101,7 +99,7 @@ def ref_data(in_shape, axis, keepdims, reduce_type, dtype):
     return in_npy, in_npy_map, out_npy
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_reduce_map(
     hexagon_session: Session, ref_data, in_shape, axis, keepdims, reduce_type, dtype
 ):
diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py
index 6857decabf958..7e734af7e0265 100644
--- a/tests/python/contrib/test_hexagon/topi/test_softmax.py
+++ b/tests/python/contrib/test_hexagon/topi/test_softmax.py
@@ -26,8 +26,6 @@
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 
-from ..conftest import requires_hexagon_toolchain
-
 dtype = tvm.testing.parameter(
     "float16",
     "float32",
@@ -54,7 +52,7 @@
 )
 
 
-@requires_hexagon_toolchain
+@tvm.testing.requires_hexagon
 def test_softmax(hexagon_session: Session, shape, dtype, softmax_operation):
     if dtype == "float16":
         pytest.xfail("float16 is not supported.")

From 82086ed6bf347f61b58bac7e6bf93586c85fe9a6 Mon Sep 17 00:00:00 2001
From: Alan MacDonald <alanmacd@users.noreply.github.com>
Date: Tue, 17 May 2022 10:32:58 -0700
Subject: [PATCH 02/59] [docs][microtvm] fix command path in microTVM Reference
 Virtual Machines Running Tests documentation (#11333)

---
 gallery/how_to/work_with_microtvm/micro_reference_vm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py
index 7733294052821..9eacd9a963e1f 100644
--- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py
+++ b/gallery/how_to/work_with_microtvm/micro_reference_vm.py
@@ -138,12 +138,12 @@
 Running tests
 =============
 
-Once the VM has been provisioned, tests can executed using ``poetry``:
+Once the VM has been provisioned, tests can be executed using ``poetry``:
 
 .. code-block:: bash
 
     $ cd apps/microtvm/reference-vm/zephyr
-    $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=stm32f746g_disco
+    $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=stm32f746g_disco
 
 If you do not have physical hardware attached, but wish to run the tests using the
 local QEMU emulator running within the VM, run the following commands instead:
@@ -152,7 +152,7 @@
 
     $ cd /Users/yourusername/path/to/tvm
     $ cd apps/microtvm/reference-vm/zephyr/
-    $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --zephyr-board=qemu_x86
+    $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --zephyr-board=qemu_x86
 
 
 

From 1c63c3db86e2c67948189579b71c35af1566edd3 Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Tue, 17 May 2022 22:50:38 +0300
Subject: [PATCH 03/59] [Runtime][ThreadPool] Enhance CPU Affinity
 configuration for OpenMP case. (#11343)

This commit allows threads to be pinned to cores when OpenMP is used. It
enhances the `tvm::runtime::threading::Configure` method to work with OpenMP
and the "kSpecify*" affinity modes.
---
 src/runtime/thread_pool.cc          | 55 +++++++++++++++++++++++++++++
 tests/cpp/threading_backend_test.cc |  2 +-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc
index ef1369c7496fb..4692e06734277 100644
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -398,6 +398,57 @@ TVM_REGISTER_GLOBAL("runtime.NumThreads").set_body_typed([]() -> int32_t {
 });
 
 namespace threading {
+
+#if TVM_THREADPOOL_USE_OPENMP
+/*!
+ * \brief Helper that pins OpenMP worker threads to the given CPU cores, e.g. for multi-instance
+ *        execution, when the OpenMP thread pool is used.
+ *
+ * \param mode Affinity mode. Only kSpecifyOneCorePerThread and kSpecifyThreadShareAllCore are
+ *             handled; any other mode leaves thread affinity untouched.
+ * \param nthreads The number of threads to use (0 = use all). Not referenced by this helper.
+ * \param cpus A list of CPU ids to set 'cpu affinity'.
+ * \note The body is compiled only for Linux and Android; elsewhere this function does nothing.
+ */
+static void ConfigureOMP(tvm::runtime::threading::ThreadGroup::AffinityMode mode, int nthreads,
+                         const std::vector<unsigned int>& cpus) {
+#if defined(__linux__) || defined(__ANDROID__)
+  const int num_workers = MaxConcurrency();
+
+  if (mode == ThreadGroup::kSpecifyOneCorePerThread) {
+#pragma omp parallel num_threads(num_workers)
+    {
+      int core_id = cpus[omp_get_thread_num()];  // i-th OpenMP thread uses cpus[i]
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(core_id, &cpuset);
+#if defined(__ANDROID__)
+      sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
+#else
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+#endif
+    }
+  } else if (mode == ThreadGroup::kSpecifyThreadShareAllCore) {
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    for (auto id : cpus) {
+      CPU_SET(id, &cpuset);  // every listed core is allowed
+    }
+
+#pragma omp parallel num_threads(num_workers)
+    {
+#if defined(__ANDROID__)
+      sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
+#else
+      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+#endif
+    }
+  }
+#endif
+}
+
+#endif
+
 void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); }
 /*!
  * \brief configure the CPU id affinity
@@ -410,7 +461,11 @@ void ResetThreadPool() { tvm::runtime::ThreadPool::ThreadLocal()->Reset(); }
 void Configure(tvm::runtime::threading::ThreadGroup::AffinityMode mode, int nthreads,
                std::vector<unsigned int> cpus) {
   tvm::runtime::threading::SetMaxConcurrency(cpus.size());
+#if !TVM_THREADPOOL_USE_OPENMP
   tvm::runtime::ThreadPool::ThreadLocal()->UpdateWorkerConfiguration(mode, nthreads, cpus);
+#else
+  ConfigureOMP(mode, nthreads, cpus);
+#endif
 }
 int32_t NumThreads() { return tvm::runtime::ThreadPool::ThreadLocal()->NumThreads(); }
 }  // namespace threading
diff --git a/tests/cpp/threading_backend_test.cc b/tests/cpp/threading_backend_test.cc
index db32623531b87..5adf1f9ae36c9 100644
--- a/tests/cpp/threading_backend_test.cc
+++ b/tests/cpp/threading_backend_test.cc
@@ -169,7 +169,7 @@ TEST(ThreadingBackend, TVMBackendAffinityConfigure) {
             std::atomic<size_t> acc(0);
             AffinityCheck ac(thread_pool_index, sys_max_concurrency, &acc);
             std::vector<unsigned int> cpus;
-            std::cout << affinity_mode << std::endl;
+            LOG(INFO) << affinity_mode << std::endl;
             for (int k = 0; k < cpus_num_per_thread; k++) {
               cpus.push_back(thread_pool_index * cpus_num_per_thread + k);
             }

From 2f7d732972f3605bd094609ab9ce5b7d5d80eac9 Mon Sep 17 00:00:00 2001
From: apeskov <peskovnn@gmail.com>
Date: Tue, 17 May 2022 22:51:03 +0300
Subject: [PATCH 04/59] [BYOC] Threadsafe initialization of JSONRuntime module
 (#11339)

Signed-off-by: Alexander Peskov <peskovnn@gmail.com>
---
 src/runtime/contrib/json/json_runtime.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h
index 0c6d0f6d71363..374a440e29020 100644
--- a/src/runtime/contrib/json/json_runtime.h
+++ b/src/runtime/contrib/json/json_runtime.h
@@ -88,8 +88,11 @@ class JSONRuntimeBase : public ModuleNode {
       // The function to initialize constant tensors.
       return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         ICHECK_EQ(args.size(), 1U);
-        this->Init(args[0]);
-        this->initialized_ = true;
+        std::lock_guard<std::mutex> guard(this->initialize_mutex_);
+        if (!this->initialized_) {
+          this->Init(args[0]);
+          this->initialized_ = true;
+        }
         *rv = 0;
       });
     } else {
@@ -270,6 +273,8 @@ class JSONRuntimeBase : public ModuleNode {
   std::vector<uint32_t> const_idx_;
   /*! \brief Indicate if the engine has been initialized. */
   bool initialized_{false};
+  /*! \brief Mutex guarding the one-time initialization of the engine. */
+  std::mutex initialize_mutex_;
 };
 
 }  // namespace json

From 9b66f66f63a264b6a7a1f50ace29bf1f9e53d43e Mon Sep 17 00:00:00 2001
From: Christian Convey <cconvey@octoml.ai>
Date: Tue, 17 May 2022 16:24:06 -0400
Subject: [PATCH 05/59] [build] Fix/simplify `ccache` logic (#11189)

- Remove TVM's `USE_CCACHE` option in favor
  of CMake's built-in `CMAKE_C_COMPILER_LAUNCHER`
  and `CMAKE_CXX_COMPILER_LAUNCHER` variables.

  This eliminates a significant source of
  complexity, especially:

  - TVM's CI scripts, which use `sccache`
    instead of `ccache`, and

  - calls to `ExternalProject_add` in TVM's CMake logic.

- Ensure that `CMAKE_C[XX]_COMPILER_LAUNCHER` variables
  are passed through in all `ExternalProject_add` calls.

- Update user documentation.
---
 CMakeLists.txt                  | 29 -----------------------------
 apps/hexagon_api/CMakeLists.txt |  7 +++++++
 cmake/config.cmake              | 12 ------------
 docs/install/from_source.rst    | 16 ++++++++++++++--
 4 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7023caf97eb59..5352eddd25987 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -766,35 +766,6 @@ if(BUILD_FOR_HEXAGON)
   endif()
 endif()
 
-#Caches the build.
-#Note that ccache-3.x doesn't support nvcc well, so CUDA kernels may never hit the cache and still
-#need to be re-compiled every time. Using ccache 4.0+ can resolve this issue.
-
-if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache
-  if("${USE_CCACHE}" STREQUAL "AUTO") # Auto mode
-    find_program(CCACHE_FOUND ccache)
-    if(CCACHE_FOUND)
-      message(STATUS "Found the path to ccache, enabling ccache")
-      set(PATH_TO_CCACHE ccache)
-    else()
-      message(STATUS "Didn't find the path to CCACHE, disabling ccache")
-    endif(CCACHE_FOUND)
-  elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN})
-    find_program(CCACHE_FOUND ccache)
-    if(CCACHE_FOUND)
-      message(STATUS "Found the path to ccache, enabling ccache")
-      set(PATH_TO_CCACHE ccache)
-    else()
-      message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. USE_CCACHE=" "${USE_CCACHE}")
-    endif(CCACHE_FOUND)
-  else() # /path/to/ccache
-    set(PATH_TO_CCACHE USE_CCACHE)
-    message(STATUS "Setting ccache path to " "${PATH_TO_CCACHE}")
-  endif()
-  # Set the flag for ccache
-  set(CXX_COMPILER_LAUNCHER PATH_TO_CCACHE)
-endif(USE_CCACHE)
-
 find_and_set_linker(${USE_ALTERNATIVE_LINKER})
 
 if(${SUMMARIZE})
diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index 0725b87913a0f..feafff3f98da4 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -42,6 +42,8 @@ ExternalProject_Add(x86_tvm_runtime_rpc
   CMAKE_ARGS
     "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
     "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+    "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}"
+    "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}"
     "-DUSE_HEXAGON_TOOLCHAIN=${USE_HEXAGON_TOOLCHAIN}"
     "-DCMAKE_CXX_STANDARD=14"
     "-DUSE_LIBBACKTRACE=OFF"
@@ -70,6 +72,8 @@ ExternalProject_Add(android_tvm_runtime_rpc
   SOURCE_DIR "${TVM_SOURCE_DIR}"
   BUILD_COMMAND $(MAKE) runtime tvm_rpc
   CMAKE_ARGS
+    "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}"
+    "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}"
     "-DCMAKE_TOOLCHAIN_FILE=${USE_ANDROID_TOOLCHAIN}"
     "-DANDROID_PLATFORM=${ANDROID_PLATFORM}"
     "-DANDROID_ABI=${ANDROID_ABI}"
@@ -86,6 +90,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
 )
+
 ExternalProject_Get_Property(android_tvm_runtime_rpc BINARY_DIR)
 ExternalProject_Add_Step(android_tvm_runtime_rpc copy_runtime
   COMMAND ${CMAKE_COMMAND} -E copy_if_different
@@ -109,6 +114,8 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
   SOURCE_DIR "${TVM_SOURCE_DIR}"
   BUILD_COMMAND $(MAKE) runtime hexagon_rpc_sim
   CMAKE_ARGS
+    "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}"
+    "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}"
     "-DCMAKE_C_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang"
     "-DCMAKE_CXX_COMPILER=${USE_HEXAGON_TOOLCHAIN}/bin/hexagon-clang++"
     "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}"
diff --git a/cmake/config.cmake b/cmake/config.cmake
index dc2512175b425..c436c3feaa9fb 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -334,18 +334,6 @@ set(USE_LIBBACKTRACE AUTO)
 # runtime functions to be unavailable to the program.
 set(BUILD_STATIC_RUNTIME OFF)
 
-
-# Caches the build so that building is faster when switching between branches.
-# If you switch branches, build and then encounter a linking error, you may
-# need to regenerate the build tree through "make .." (the cache will
-# still provide significant speedups).
-# Possible values:
-# - AUTO: search for path to ccache, disable if not found.
-# - ON: enable ccache by searching for the path to ccache, report an error if not found
-# - OFF: disable ccache
-# - /path/to/ccache: use specific path to ccache
-set(USE_CCACHE AUTO)
-
 # Whether to enable PAPI support in profiling. PAPI provides access to hardware
 # counters while profiling.
 # Possible values:
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 5fb48cb0e54f1..8597de224cd9f 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -109,7 +109,7 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas
 
           export TVM_LOG_DEBUG="ir/transform.cc=1;relay/ir/transform.cc=1"
 
-- TVM requires LLVM for for CPU codegen. We highly recommend you to build with the LLVM support on.
+- TVM requires LLVM for CPU codegen. We highly recommend you to build with the LLVM support on.
 
   - LLVM 4.0 or higher is needed for build with LLVM. Note that version of LLVM from default apt may lower than 4.0.
   - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from
@@ -126,6 +126,18 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas
   - If you are a PyTorch user, it is recommended to set ``(USE_LLVM "/path/to/llvm-config --link-static")`` and ``set(HIDE_PRIVATE_SYMBOLS ON)``
     to avoid potential symbol conflicts between different versions LLVM used by TVM and PyTorch.
 
+  - On supported platforms, the `Ccache compiler wrapper <https://ccache.dev/>`_ may be helpful for
+    reducing TVM's build time.  There are several ways to enable Ccache in TVM builds:
+
+    - Ccache's Masquerade mode. This is typically enabled during the Ccache installation process.
+      To have TVM use Ccache in masquerade, simply specify the appropriate C/C++ compiler
+      paths when configuring TVM's build system.  For example:
+      ``cmake -DCMAKE_CXX_COMPILER=/usr/lib/ccache/c++ ...``.
+
+    - Ccache as CMake's C++ compiler prefix.  When configuring TVM's build system,
+      set the CMake variable ``CMAKE_CXX_COMPILER_LAUNCHER`` to an appropriate value.
+      E.g. ``cmake -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ...``.
+
 - We can then build tvm and related libraries.
 
   .. code:: bash
@@ -315,7 +327,7 @@ configuration. A workaround for this is to do the following commands:
 
         brew install openblas gfortran
 
-        pip install pybind11 cython pythran  
+        pip install pybind11 cython pythran
 
         export OPENBLAS=/opt/homebrew/opt/openblas/lib/
 

From 1bde845814dd751d11659c3ba6781a6ffc4ede45 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 17 May 2022 13:37:36 -0700
Subject: [PATCH 06/59] [ci] Use r5.large nodes for hexagon build and some
 tests (#11120)

* PR #11314 - [ci][docker] Update images to include sccache changes

* [ci] Use r5.large nodes for less-intensive jobs

This uses the `CPU-SMALL` label for certain jobs in CI, which is backed by r5.large instances in EC2 rather than c4.4xlarge instances which are much more expensive

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                                | 22 +++++++++++-----------
 jenkins/Jenkinsfile.j2                     | 21 ++++++++++++---------
 tests/scripts/ci.py                        | 14 +++++++++++---
 tests/scripts/task_build.py                | 12 +++++++-----
 tests/scripts/task_config_build_hexagon.sh |  2 +-
 tests/scripts/task_lint.sh                 |  3 ---
 6 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index c0fb3f5df20c5..6fcdc3cd4a159 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-17T10:32:14.621387
+// Generated at 2022-05-17T09:16:58.363027
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -688,7 +688,7 @@ stage('Build') {
   },
   'BUILD: Hexagon': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") {
           init_git()
           sh (
@@ -845,7 +845,7 @@ stage('Test') {
   },
   'unittest: CPU': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
           timeout(time: max_time, unit: 'MINUTES') {
             try {
@@ -873,7 +873,7 @@ stage('Test') {
   },
   'python: i386 1 of 3': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
           try {
             init_git()
@@ -904,7 +904,7 @@ stage('Test') {
   },
   'python: i386 2 of 3': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
           try {
             init_git()
@@ -934,7 +934,7 @@ stage('Test') {
   },
   'python: i386 3 of 3': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") {
           try {
             init_git()
@@ -964,7 +964,7 @@ stage('Test') {
   },
   'test: Hexagon 1 of 4': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
           try {
             init_git()
@@ -997,7 +997,7 @@ stage('Test') {
   },
   'test: Hexagon 2 of 4': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
           try {
             init_git()
@@ -1029,7 +1029,7 @@ stage('Test') {
   },
   'test: Hexagon 3 of 4': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
           try {
             init_git()
@@ -1061,7 +1061,7 @@ stage('Test') {
   },
   'test: Hexagon 4 of 4': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
           try {
             init_git()
@@ -1093,7 +1093,7 @@ stage('Test') {
   },
   'test: QEMU': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-qemu") {
           timeout(time: max_time, unit: 'MINUTES') {
             try {
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 3b2ca5d711038..0264a526e7b56 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -605,7 +605,7 @@ stage('Build') {
   },
   'BUILD: Hexagon': {
     if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
+      node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
           init_git()
           sh (
@@ -681,7 +681,8 @@ stage('Test') {
   {% endcall %}
   {% call m.test_step(
     name="unittest: CPU",
-    node="CPU", ws="tvm/ut-python-cpu",
+    node="CPU-SMALL",
+    ws="tvm/ut-python-cpu",
     platform="cpu",
   ) %}
     unpack_lib('cpu', tvm_multilib_tsim)
@@ -696,11 +697,11 @@ stage('Test') {
   {% endcall %}
   {% call(shard_index, num_shards) m.sharded_test_step(
     name="python: i386",
-    node="CPU",
-      num_shards=3,
-      ws="tvm/integration-python-i386",
-      platform="i386",
-    ) %}
+    node="CPU-SMALL",
+    num_shards=3,
+    ws="tvm/integration-python-i386",
+    platform="i386",
+  ) %}
     unpack_lib('i386', tvm_multilib)
     ci_setup(ci_i386)
     {% if shard_index == 1 %}
@@ -715,7 +716,8 @@ stage('Test') {
   {% endcall %}
   {% call(shard_index, num_shards) m.sharded_test_step(
     name="test: Hexagon",
-    node="CPU", ws="tvm/test-hexagon",
+    node="CPU-SMALL",
+    ws="tvm/test-hexagon",
     platform="hexagon",
     num_shards=4,
   ) %}
@@ -735,7 +737,8 @@ stage('Test') {
   {% endcall %}
   {% call m.test_step(
     name="test: QEMU",
-    node="CPU", ws="tvm/test-qemu",
+    node="CPU-SMALL",
+    ws="tvm/test-qemu",
     platform="qemu",
   ) %}
     unpack_lib('qemu', tvm_lib)
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 22e6690beb116..d45c3b1ae9cb2 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -561,11 +561,14 @@ def add_subparser(
     return subparser
 
 
+CPP_UNITTEST = ("run c++ unitests", ["./tests/scripts/task_cpp_unittest.sh"])
+
 generated = [
     generate_command(
         name="gpu",
         help="Run GPU build and test(s)",
         options={
+            "cpp": CPP_UNITTEST,
             "topi": ("run topi tests", ["./tests/scripts/task_python_topi.sh"]),
             "unittest": (
                 "run unit tests",
@@ -582,6 +585,7 @@ def add_subparser(
         name="cpu",
         help="Run CPU build and test(s)",
         options={
+            "cpp": CPP_UNITTEST,
             "integration": (
                 "run integration tests",
                 ["./tests/scripts/task_python_integration.sh"],
@@ -601,6 +605,7 @@ def add_subparser(
         name="i386",
         help="Run i386 build and test(s)",
         options={
+            "cpp": CPP_UNITTEST,
             "integration": (
                 "run integration tests",
                 [
@@ -619,26 +624,28 @@ def add_subparser(
         name="qemu",
         help="Run QEMU build and test(s)",
         options={
+            "cpp": CPP_UNITTEST,
             "test": (
                 "run microTVM tests",
                 [
                     "./tests/scripts/task_python_microtvm.sh",
                     "./tests/scripts/task_demo_microtvm.sh",
                 ],
-            )
+            ),
         },
     ),
     generate_command(
         name="hexagon",
         help="Run Hexagon build and test(s)",
         options={
+            "cpp": CPP_UNITTEST,
             "test": (
                 "run Hexagon API/Python tests",
                 [
                     "./tests/scripts/task_build_hexagon_api.sh",
                     "./tests/scripts/task_python_hexagon.sh",
                 ],
-            )
+            ),
         },
     ),
     generate_command(
@@ -646,13 +653,14 @@ def add_subparser(
         help="Run ARM build and test(s) (native or via QEMU on x86)",
         precheck=check_arm_qemu,
         options={
+            "cpp": CPP_UNITTEST,
             "python": (
                 "run full Python tests",
                 [
                     "./tests/scripts/task_python_unittest.sh",
                     "./tests/scripts/task_python_arm_compute_library.sh",
                 ],
-            )
+            ),
         },
     ),
 ]
diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py
index 52b7dd421b461..e4583fe6af041 100755
--- a/tests/scripts/task_build.py
+++ b/tests/scripts/task_build.py
@@ -37,21 +37,22 @@
     env = {"VTA_HW_PATH": str(Path(os.getcwd()) / "3rdparty" / "vta-hw")}
     sccache_exe = shutil.which("sccache")
 
-    use_sccache = sccache_exe is not None and args.sccache_bucket is not None
+    use_sccache = sccache_exe is not None
     build_dir = Path(os.getcwd()) / args.build_dir
     build_dir = build_dir.relative_to(REPO_ROOT)
 
     if use_sccache:
-        env["SCCACHE_BUCKET"] = args.sccache_bucket
+        if args.sccache_bucket:
+            env["SCCACHE_BUCKET"] = args.sccache_bucket
+            logging.info(f"Using sccache bucket: {args.sccache_bucket}")
+        else:
+            logging.info(f"No sccache bucket set, using local cache")
         env["CXX"] = "/opt/sccache/c++"
         env["CC"] = "/opt/sccache/cc"
 
-        logging.info(f"Using sccache bucket: {args.sccache_bucket}")
     else:
         if sccache_exe is None:
             reason = "'sccache' executable not found"
-        elif args.sccache_bucket is None:
-            reason = "'sccache' executable not found"
         else:
             reason = "<unknown>"
         logging.info(f"Not using sccache, reason: {reason}")
@@ -71,6 +72,7 @@
     num_cpus = max(available_cpus, 1)
 
     sh.run("cmake -GNinja -DCMAKE_BUILD_TYPE=RelWithDebInfo ..", cwd=build_dir)
+
     target = ""
     if args.cmake_target:
         target = args.cmake_target
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index c298800fcd4ed..7bce64cddb5a9 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -29,7 +29,7 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake
-echo set\(CMAKE_CXX_COMPILER "${CLANG_LLVM_HOME}/bin/clang++"\) >> config.cmake
+echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake
 echo set\(USE_HEXAGON "ON"\) >> config.cmake
 echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_PATH}"\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh
index e0c953d61841a..8fbba52662de8 100755
--- a/tests/scripts/task_lint.sh
+++ b/tests/scripts/task_lint.sh
@@ -31,9 +31,6 @@ function shard1 {
   echo "Convert scripts to Python..."
   tests/scripts/task_convert_scripts_to_python.sh
 
-  # TODO: Remove this ad-hoc pip install once https://github.com/apache/tvm/pull/11265
-  # is added to the ci_lint Docker image
-  python3 -m pip install --user -r jenkins/requirements.txt
   echo "Check Jenkinsfile generation"
   python3 jenkins/generate.py --check
 

From 0705bd765037088eca803b7ac80c8e9d83c06ab2 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Tue, 17 May 2022 13:53:20 -0700
Subject: [PATCH 07/59] [Hexagon][Docker] Update image version (#11332)

---
 Jenkinsfile            | 2 +-
 jenkins/Jenkinsfile.j2 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 6fcdc3cd4a159..4db9a45e2e5c6 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -56,7 +56,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e'
 ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e'
 ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e'
 ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e'
-ci_hexagon = 'tlcpack/ci-hexagon:20220513-055910-fa834f67e'
+ci_hexagon = 'tlcpack/ci-hexagon:20220516-190055-672ce3365'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 0264a526e7b56..88ced73a8f971 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -58,7 +58,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e'
 ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e'
 ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e'
 ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e'
-ci_hexagon = 'tlcpack/ci-hexagon:20220513-055910-fa834f67e'
+ci_hexagon = 'tlcpack/ci-hexagon:20220516-190055-672ce3365'
 // <--- End of regex-scanned config.
 
 // Parameters to allow overriding (in Jenkins UI), the images

From 0e2f869eeadbb349f849ed2add86a622e97053cd Mon Sep 17 00:00:00 2001
From: czh978 <41666381+czh978@users.noreply.github.com>
Date: Wed, 18 May 2022 05:08:08 +0800
Subject: [PATCH 08/59] logsoftmax reusing the softmax function (#11141)

Co-authored-by: caizihua <978497756@qq.com>
---
 python/tvm/relay/frontend/onnx.py | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 81f12c2d81036..e68daca4c4f0f 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2412,30 +2412,18 @@ class LogSoftmax(OnnxOpConverter):
     """Operator converter for Softmax."""
 
     @classmethod
-    def run_calculation(cls, x, axes):
+    def run_calculation(cls, inputs, attr, params, opset):
         """Run the calculation for Log Softmax calculation."""
-        m = _op.max(x, axes, keepdims=True)
-        e = _op.exp(x - m)
-        s = _op.sum(e, axes, keepdims=True)
-        return x - m - _op.log(s)
+        res = Softmax.get_converter(opset)(inputs, attr, params)
+        return _op.log(res)
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        axis = attr.get("axis", 1)
-        ndim = len(infer_shape(inputs[0]))
-        if axis < 0:
-            axis += ndim
-        axes = list(range(axis, ndim))
-        return cls.run_calculation(inputs[0], axes)
+        return cls.run_calculation(inputs, attr, params, opset=1)
 
     @classmethod
     def _impl_v13(cls, inputs, attr, params):
-        axis = attr.get("axis", -1)
-        ndim = len(infer_shape(inputs[0]))
-        if axis < 0:
-            axis += ndim
-        axes = [axis]
-        return cls.run_calculation(inputs[0], axes)
+        return cls.run_calculation(inputs, attr, params, opset=13)
 
 
 class Hardmax(OnnxOpConverter):
@@ -4852,7 +4840,8 @@ def _impl_v13(cls, inputs, attr, params):
             weight_tensor = None
 
         get_log_prob = attr["tvm_custom"]["num_outputs"] == 2
-        log_softmax_tensor = LogSoftmax.run_calculation(input_tensor, axes=[1])
+        log_softmax_attr = {"axis": 1}
+        log_softmax_tensor = LogSoftmax.get_converter(13)([input_tensor], log_softmax_attr, None)
 
         loss, weight_total = NegativeLogLikelihoodLoss.run_calculation(
             log_softmax_tensor,

From 75c31cae75fe31af9e0901210ba7fa597e6f153a Mon Sep 17 00:00:00 2001
From: Josh Fromm <jwfromm@octoml.ai>
Date: Tue, 17 May 2022 16:17:48 -0700
Subject: [PATCH 09/59] [Relay] Bug fix when applying history using an iterator
 or records. (#11306)

* Bug fix when applying history using an iterator or records.

* I forgot strings are iterables.
---
 python/tvm/auto_scheduler/dispatcher.py          | 3 ++-
 python/tvm/autotvm/task/dispatcher.py            | 5 +++--
 tests/python/relay/test_auto_scheduler_tuning.py | 7 +++++++
 tests/python/unittest/test_autotvm_record.py     | 5 +++++
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py
index eceeba38e0810..98566f8636507 100644
--- a/python/tvm/auto_scheduler/dispatcher.py
+++ b/python/tvm/auto_scheduler/dispatcher.py
@@ -25,6 +25,7 @@
 
 import logging
 import pathlib
+from collections.abc import Iterable
 
 import numpy as np
 
@@ -199,7 +200,7 @@ def load(self, records, n_lines=None):
             if it is not None, only load the first `n_lines` lines of log
         """
         joint_records = []
-        if not isinstance(records, (list, tuple)):
+        if not isinstance(records, Iterable) or isinstance(records, str):
             records = [records]
 
         for rec in records:
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index ffff50b9dc0b3..6c072dc1fa17b 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -31,6 +31,7 @@
 from __future__ import absolute_import as _abs
 
 import logging
+from collections.abc import Iterable
 
 import numpy as np
 
@@ -212,7 +213,7 @@ def load(self, records):
             Collection of tuning records.
             If is str, then it should be the filename of a records log file.
             Each row of this file is an encoded record pair. If it is a list
-            it can either be a list of paths to logs that will loaded jointly or
+            it can either be a list of paths to logs that will be loaded jointly or
             an iterator of measurement results.
         """
         # pylint: disable=import-outside-toplevel
@@ -220,7 +221,7 @@ def load(self, records):
         from ..record import load_from_file
 
         joint_records = []
-        if not isinstance(records, (list, tuple)):
+        if not isinstance(records, Iterable) or isinstance(records, str):
             records = [records]
 
         for rec in records:
diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py
index c9ce5b59ff09b..735486ef27c68 100644
--- a/tests/python/relay/test_auto_scheduler_tuning.py
+++ b/tests/python/relay/test_auto_scheduler_tuning.py
@@ -62,6 +62,13 @@ def tune_network(network, target):
                 best, auto_scheduler.dispatcher.ApplyHistoryBest
             ), "Unable to load multiple log files jointly."
 
+        # Confirm iterables can be directly loaded.
+        loaded_recs = auto_scheduler.dispatcher.load_records(log_file)
+        with auto_scheduler.ApplyHistoryBest(iter(loaded_recs)) as best:
+            assert isinstance(
+                best, auto_scheduler.dispatcher.ApplyHistoryBest
+            ), "Unable to ingest logs from an interator."
+
         # Sample a schedule when missing
         with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2):
             with tvm.transform.PassContext(
diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py
index 2ee75cf18c0e5..147122ff10d6e 100644
--- a/tests/python/unittest/test_autotvm_record.py
+++ b/tests/python/unittest/test_autotvm_record.py
@@ -91,6 +91,11 @@ def test_apply_history_best():
     x = hist_best.query(target, tsk.workload)
     assert str(x) == str(tsk.config_space.get(2))
 
+    # Confirm same functionality for iterators.
+    hist_best = ApplyHistoryBest(iter(records))
+    x = hist_best.query(target, tsk.workload)
+    assert str(x) == str(tsk.config_space.get(2))
+
 
 if __name__ == "__main__":
     test_load_dump()

From f755c97492c7e851277b9fc52854afeb18e14952 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 17 May 2022 16:24:48 -0700
Subject: [PATCH 10/59] [skip ci][ci][docker] Pin Pillow version (#11348)

A recent release depends on some things we don't have installed, so don't use it.

e.g. https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-11319/5/pipeline/

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docker/install/ubuntu_install_python_package.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 4f99f17842386..0353814efcb83 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -30,7 +30,7 @@ pip3 install --upgrade \
     numpy~=1.19.5 \
     orderedset \
     packaging \
-    Pillow \
+    Pillow==9.1.0 \
     psutil \
     pytest \
     tlcpack-sphinx-addon==0.2.1 \

From 9c27ff5e58bb5ceccbc8a5855689da0cb59dac79 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Tue, 17 May 2022 23:22:54 -0700
Subject: [PATCH 11/59] [ci] Bump job timeout to 3 hours (#11350)

This is intended to be temporary to avoid timeouts on jobs while we work on getting some things under control like artifact upload time and shards for various jobs.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile            | 4 ++--
 jenkins/Jenkinsfile.j2 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 4db9a45e2e5c6..424f97494d767 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-17T09:16:58.363027
+// Generated at 2022-05-17T17:26:21.660243
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -83,7 +83,7 @@ upstream_revision = null
 docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM'
 docker_build = 'docker/build.sh'
 // timeout in minutes
-max_time = 120
+max_time = 180
 rebuild_docker_images = false
 
 def per_exec_ws(folder) {
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 88ced73a8f971..f250ff12feed0 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -80,7 +80,7 @@ upstream_revision = null
 docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM'
 docker_build = 'docker/build.sh'
 // timeout in minutes
-max_time = 120
+max_time = 180
 rebuild_docker_images = false
 
 def per_exec_ws(folder) {

From b5e1fdd3ddb47b097be36c44a8c8de2b305ecd2b Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Wed, 18 May 2022 01:16:36 -0700
Subject: [PATCH 12/59] Improve error messages with TVM_LOG_DEBUG and add docs
 (#11344)

* Improve error messages with TVM_LOG_DEBUG and add docs.

* Fix requirement to prepend "src" with /.
---
 docs/dev/how_to/debugging_tvm.rst | 72 +++++++++++++++++++++++++++++++
 docs/dev/how_to/how_to.rst        |  1 +
 src/runtime/logging.cc            | 26 +++++++++--
 tests/cpp/runtime/logging_test.cc | 29 +++++++++++--
 4 files changed, 121 insertions(+), 7 deletions(-)
 create mode 100644 docs/dev/how_to/debugging_tvm.rst

diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst
new file mode 100644
index 0000000000000..6060f797b3e4b
--- /dev/null
+++ b/docs/dev/how_to/debugging_tvm.rst
@@ -0,0 +1,72 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..    http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+.. _debugging-tvm:
+
+Debugging TVM
+=============
+
+**NOTE**: This page is a work in progress. Everyone is welcome to add suggestions and tips via
+sending a PR to modify this page. The goal with this page is to centralize the commonly-used
+techniques being used to debug TVM and to spread awareness to the community. To that end, we may
+seek to promote more broadly-used techniques to the top of this doc.
+
+VLOGging
+--------
+
+TVM provides a verbose-logging facility that allows you to commit trace-level debugging messages
+without impacting the binary size or runtime of TVM in production. You can use VLOG in your code
+as follows:
+
+.. code-block:: c++
+
+    void Foo(const std::string& bar) {
+      VLOG(2) << "Running Foo(" << bar << ")";
+      // ...
+    }
+
+In this example, the integer ``2`` passed to ``VLOG()`` indicates a verbosity level. The higher the
+level, the more logs printed. In general, TVM levels range from 0 to 2, with 3 being used only for
+extremely low-level core runtime properties. The VLOG system is configured at startup time to print
+VLOG statements between ``0`` and some integer ``N``. ``N`` can be set per-file or globally.
+
+VLOGs don't print or impact binary size or runtime by default (when compiled with proper
+optimization). To enable VLOGging, do the following:
+
+1. In ``config.cmake``, ensure you ``set(USE_RELAY_DEBUG ON)``. This flag is used to enable
+   VLOGging.
+2. Launch Python passing ``TVM_LOG_DEBUG=<spec>``, where ``<spec>`` is a comma-separated list of
+   level assignments of the form ``<file_name>=<level>``. Here are some specializations:
+
+    - The special filename ``DEFAULT`` sets the VLOG level setting for all files.
+    - ``<level>`` can be set to ``-1`` to disable VLOG in that file.
+    - ``<file_name>`` is the name of the C++ source file (e.g. ``.cc``, not ``.h``) relative to the
+      ``src/`` directory in the TVM repo. You do not need to supply ``src/`` when specifying the
+      file path, but if you do, VLOG will still interpret the path correctly.
+
+Examples:
+
+.. code-block:: shell
+
+   # enable VLOG(0), VLOG(1), VLOG(2) in all files.
+   $ TVM_LOG_DEBUG=DEFAULT=2 python3 -c 'import tvm'
+
+   # enable VLOG(0), VLOG(1), VLOG(2) in all files, except not VLOG(2) in src/bar/baz.cc.
+   $ TVM_LOG_DEBUG=DEFAULT=2,bar/baz.cc=1 python3 -c 'import tvm'
+
+   # enable VLOG(0), VLOG(1), VLOG(2) in all files, except not in src/foo/bar.cc.
+   $ TVM_LOG_DEBUG=DEFAULT=2,src/foo/bar.cc=-1 python3 -c 'import tvm'
diff --git a/docs/dev/how_to/how_to.rst b/docs/dev/how_to/how_to.rst
index 844ae0ad527e5..67bb94b007c4e 100644
--- a/docs/dev/how_to/how_to.rst
+++ b/docs/dev/how_to/how_to.rst
@@ -25,6 +25,7 @@ various areas of the TVM stack.
 .. toctree::
    :maxdepth: 1
 
+   debugging_tvm
    relay_add_op
    relay_add_pass
    relay_bring_your_own_codegen
diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc
index 0f614a7eaff19..c6c756d85c7e7 100644
--- a/src/runtime/logging.cc
+++ b/src/runtime/logging.cc
@@ -197,6 +197,12 @@ std::string FileToVLogMapKey(const std::string& filename) {
   // Canonicalize the filename.
   // TODO(mbs): Not Windows friendly.
   size_t last_src = filename.rfind(kSrcPrefix, std::string::npos, kSrcPrefixLength);
+  if (last_src == std::string::npos) {
+    std::string no_slash_src{kSrcPrefix + 1};
+    if (filename.substr(0, no_slash_src.size()) == no_slash_src) {
+      return filename.substr(no_slash_src.size());
+    }
+  }
   // Strip anything before the /src/ prefix, on the assumption that will yield the
   // TVM project relative filename. If no such prefix fallback to filename without
   // canonicalization.
@@ -222,6 +228,15 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) {
     return settings;
   }
   std::istringstream spec_stream(spec);
+  auto tell_pos = [&](const std::string& last_read) {
+    int pos = spec_stream.tellg();
+    if (pos == -1) {
+      LOG(INFO) << "override pos: " << last_read;
+      // when pos == -1, failbit was set due to std::getline reaching EOF without seeing delimiter.
+      pos = spec.size() - last_read.size();
+    }
+    return pos;
+  };
   while (spec_stream) {
     std::string name;
     if (!std::getline(spec_stream, name, '=')) {
@@ -229,7 +244,7 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) {
       break;
     }
     if (name.empty()) {
-      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, empty name";
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(name) << ": empty filename";
       return settings;
     }
 
@@ -237,18 +252,21 @@ TvmLogDebugSettings TvmLogDebugSettings::ParseSpec(const char* opt_spec) {
 
     std::string level;
     if (!std::getline(spec_stream, level, ',')) {
-      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, expecting level";
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": expecting \"=<level>\" after \"" << name << "\"";
       return settings;
     }
     if (level.empty()) {
-      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, empty level";
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": empty level after \"" << name << "\"";
       return settings;
     }
     // Parse level, default to 0 if ill-formed which we don't detect.
     char* end_of_level = nullptr;
     int level_val = static_cast<int>(strtol(level.c_str(), &end_of_level, 10));
     if (end_of_level != level.c_str() + level.size()) {
-      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed, invalid level";
+      LOG(FATAL) << "TVM_LOG_DEBUG ill-formed at position " << tell_pos(level)
+                 << ": invalid level: \"" << level << "\"";
       return settings;
     }
     LOG(INFO) << "TVM_LOG_DEBUG enables VLOG statements in '" << name << "' up to level " << level;
diff --git a/tests/cpp/runtime/logging_test.cc b/tests/cpp/runtime/logging_test.cc
index ae5140ed1815f..e707606843bf3 100644
--- a/tests/cpp/runtime/logging_test.cc
+++ b/tests/cpp/runtime/logging_test.cc
@@ -17,6 +17,7 @@
  * under the License.
  */
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <tvm/runtime/logging.h>
 
@@ -60,17 +61,39 @@ TEST(TvmLogDebugSettings, VLogEnabledComplex) {
   EXPECT_FALSE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 0));
 }
 
+#define MATCH_THROW(stmt, err_type, matcher)            \
+  try {                                                 \
+    stmt;                                               \
+  } catch (const err_type& e) {                         \
+    EXPECT_THAT(e.what(), matcher);                     \
+  } catch (...) {                                       \
+    EXPECT_FALSE("stmt threw an unexpected exception"); \
+  }
+
 TEST(TvmLogDebugSettings, IllFormed) {
-  EXPECT_THROW(TvmLogDebugSettings::ParseSpec("foo/bar.cc=bogus;"), InternalError);
+  MATCH_THROW(
+      TvmLogDebugSettings::ParseSpec("foo/bar.cc=bogus;"), InternalError,
+      ::testing::HasSubstr("TVM_LOG_DEBUG ill-formed at position 11: invalid level: \"bogus;\""));
+
+  MATCH_THROW(TvmLogDebugSettings::ParseSpec("DEFAULT=2;bar/baz.cc=2"), InternalError,
+              ::testing::HasSubstr(
+                  "TVM_LOG_DEBUG ill-formed at position 8: invalid level: \"2;bar/baz.cc=2\""));
+
+  MATCH_THROW(TvmLogDebugSettings::ParseSpec("DEFAULT=2,bar/baz.cc+2"), InternalError,
+              ::testing::HasSubstr("TVM_LOG_DEBUG ill-formed at position 22: expecting "
+                                   "\"=<level>\" after \"bar/baz.cc+2\""));
 }
 
 TEST(TvmLogDebugSettings, SpecPrefix) {
   TvmLogDebugSettings settings = TvmLogDebugSettings::ParseSpec(
-      "../src/foo/bar.cc=3,src/baz.cc=-1,foo/bar/src/another/file.cc=4");
+      "../src/foo/bar.cc=3,src/baz.cc=3,foo/bar/src/another/file.cc=4");
   EXPECT_TRUE(settings.dlog_enabled());
   EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/foo/bar.cc", 3));
-  EXPECT_FALSE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 0));
+  EXPECT_TRUE(settings.VerboseEnabled("foo/bar.cc", 3));
+  EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/baz.cc", 3));
+  EXPECT_TRUE(settings.VerboseEnabled("baz.cc", 3));
   EXPECT_TRUE(settings.VerboseEnabled("my/filesystem/src/another/file.cc", 4));
+  EXPECT_TRUE(settings.VerboseEnabled("another/file.cc", 4));
 }
 
 }  // namespace

From a4be2ed9046a97fa826da9beba64c791e2c36ccf Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Wed, 18 May 2022 17:56:10 +0900
Subject: [PATCH 13/59] [TVMScript] Support inlined function call as a sugar
 (#11324)

* [TVMScript] Support function call to help construct AST

* add test

* update test

* more comment

* fix for avoiding Buffer.vload(...) case

* update parse error msg

* wrap func call with try / catch, emit error msg

* silence pylint
---
 python/tvm/script/parser.py                   | 44 +++++++++-
 .../unittest/test_tvmscript_syntax_sugar.py   | 81 +++++++++++++++++++
 2 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py
index fe71b064320f9..daeb018ea9899 100644
--- a/python/tvm/script/parser.py
+++ b/python/tvm/script/parser.py
@@ -20,7 +20,8 @@
 different python versions. Synr also provides an error handling context that we
 use for error reporting.
 """
-# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return
+# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return, broad-except
+import types
 import json
 import operator
 import inspect
@@ -543,7 +544,7 @@ def transform_Assign(self, node):
         AST abstract grammar:
             Assign(expr* targets, expr value, string? type_comment)
 
-        By now 3 patterns of Assign is supported:
+        By now 5 patterns of Assign are supported:
             1. special stmts with return value
                 1.1 Buffer = T.match_buffer()/T.buffer_decl()
                 1.2 Var = T.var()
@@ -552,6 +553,9 @@ def transform_Assign(self, node):
             3. (Store)       Var[PrimExpr] = PrimExpr
             4. with scope handlers with concise scoping and var def
                 4.1 var = T.allocate()
+            5. A call to a pure python function, consuming and producing TVMScript values.
+               The outputs are inlined into the following body (no variable is created).
+               x, y = f(...)
         """
 
         if isinstance(node.rhs, ast.Call):
@@ -577,6 +581,35 @@ def transform_Assign(self, node):
                 arg_list = self.parse_arg_list(func, node.rhs)
                 func.handle(node, self.context, arg_list, node.rhs.func_name.span)
                 return self.parse_body(node)
+            elif isinstance(func, types.FunctionType):
+                # Pattern 5
+                args = [self.transform(arg) for arg in node.rhs.params]
+                try:
+                    out = func(*args)
+                except Exception as e:
+                    self.report_error(
+                        "Error occured when invoking the function "
+                        + func.__name__
+                        + ": \n"
+                        + str(e),
+                        node.rhs.span,
+                    )
+
+                if len(node.lhs) == 1 and not isinstance(out, list):
+                    out = [out]
+
+                assert len(out) == len(node.lhs)
+
+                for var, value in zip(node.lhs, out):
+                    self.context.update_symbol(var.id.name, value, node)
+
+                body = self.parse_body(node)
+
+                for var, value in zip(node.lhs, out):
+                    self.context.remove_symbol(var.id.name)
+
+                return body
+
         if isinstance(node.rhs, (ast.Call, ast.Constant)):
             # Pattern 4 of let binding
             value = self.transform(node.rhs)
@@ -606,7 +639,7 @@ def transform_Assign(self, node):
             return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span))
 
         self.report_error(
-            """Assignments should be either
+            """Assignments should be one of:
             1. A "special statement" with return value
                 1.1 Buffer = T.match_buffer()/T.buffer_decl()
                 1.2 Var = T.var()
@@ -614,7 +647,10 @@ def transform_Assign(self, node):
             2. A store into a buffer: Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr
             3. A store into a variable: Var[PrimExpr] = PrimExpr
             4. A with scope handler with concise scoping and var def
-                4.1 var = T.allocate()""",
+                4.1 var = T.allocate()
+            5. The right-hand side being a call to a pure python function, consuming and
+               producing TVMScript values.
+               x, y = f(...)""",
             node.span,
         )
 
diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py
index a0964ea4d77ce..b3fe5674a8736 100644
--- a/tests/python/unittest/test_tvmscript_syntax_sugar.py
+++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py
@@ -265,5 +265,86 @@ def constant_binds_wrapped():
     assert_structural_equal(constant_binds, constant_binds_wrapped)
 
 
+def test_func_call():
+    def shared_16x16_to_ldmatrix_32x8_layout(i, j):
+        thread_id = (i % 8) * 4 + (j % 8) // 2
+        return thread_id, (j // 8) * 4 + (i // 8) * 2 + (j % 2)
+
+    @T.prim_func
+    def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+
+        with T.block("root"):
+            T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8])
+            T.writes(C[0:32, 0:8])
+            for i, j, k in T.grid(16, 16, 16):
+                with T.block("C"):
+                    i, j, k = T.axis.remap("SSR", [i, j, k])
+                    thread_id_C, local_id_C = shared_16x16_to_ldmatrix_32x8_layout(i, j)
+                    thread_id_A, local_id_A = shared_16x16_to_ldmatrix_32x8_layout(i, k)
+                    thread_id_B, local_id_B = shared_16x16_to_ldmatrix_32x8_layout(k, j)
+
+                    T.reads(
+                        C[thread_id_C, local_id_C],
+                        A[thread_id_A, local_id_A],
+                        B[thread_id_B, local_id_B],
+                    )
+                    T.writes(C[thread_id_C, local_id_C])
+
+                    C[thread_id_C, local_id_C] += (
+                        A[thread_id_A, local_id_A] * B[thread_id_B, local_id_B]
+                    )
+
+    @T.prim_func
+    def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+        C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp")
+
+        with T.block("root"):
+            T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8])
+            T.writes(C[0:32, 0:8])
+            for i, j, k in T.grid(16, 16, 16):
+                with T.block("C"):
+                    i, j, k = T.axis.remap("SSR", [i, j, k])
+                    T.reads(
+                        C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2],
+                        A[i % 8 * 4 + k % 8 // 2, k // 8 * 4 + i // 8 * 2 + k % 2],
+                        B[k % 8 * 4 + j % 8 // 2, j // 8 * 4 + k // 8 * 2 + j % 2],
+                    )
+                    T.writes(C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2])
+                    C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2] = (
+                        C[i % 8 * 4 + j % 8 // 2, j // 8 * 4 + i // 8 * 2 + j % 2]
+                        + A[i % 8 * 4 + k % 8 // 2, k // 8 * 4 + i // 8 * 2 + k % 2]
+                        * B[k % 8 * 4 + j % 8 // 2, j // 8 * 4 + k // 8 * 2 + j % 2]
+                    )
+
+    assert_structural_equal(mma_sync_m16n16k16_desc, mma_sync_m16n16k16_desc_manual)
+
+    # The following is an example of an error message from calling an invalid function
+
+    # error: Error occured when invoking the function sqrt:
+    # loop of ufunc does not support argument 0 of type Var which has no callable sqrt method
+    #  --> test_tvmscript_syntax_sugar.py:334:19
+    #      |
+    #  334 |              ind = sqrt(i)
+    #      |                    ^^^^^^^
+    # note: run with `TVM_BACKTRACE=1` environment variable to display a backtrace.
+
+    # Uncomment to see the error above.
+    # def sqrt(x):
+    #     import numpy as np
+    #     return np.sqrt(x)
+
+    # @T.prim_func
+    # def loop(a: T.handle) -> None:
+    #     A = T.match_buffer(a, (128,))
+    #     for i in T.serial(128):
+    #         ind = sqrt(i)
+    #         A[i] = A[ind]
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))

From dd986fd989cf002ba7c2665867b4212cbebf26dc Mon Sep 17 00:00:00 2001
From: Ziqang XU <xuzq1@shukun.net>
Date: Wed, 18 May 2022 18:56:41 +0800
Subject: [PATCH 14/59] [Runtime]Considering DLTensor's byte_offset in ZeroCopy
 function (#11340)

---
 src/runtime/graph_executor/graph_executor.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index f713671317b8d..8ae98d930f139 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -165,7 +165,9 @@ void GraphExecutor::CheckExternalDLTensor(const DLTensor* external, uint32_t eid
   const DLTensor* internal = data_entry_[eid].operator->();
 
   ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*external));
-  ICHECK_EQ(reinterpret_cast<size_t>(external->data) % kAllocAlignment, 0);
+  ICHECK_EQ(reinterpret_cast<size_t>(static_cast<char*>(external->data) + external->byte_offset) %
+                kAllocAlignment,
+            0);
   ICHECK_EQ(internal->ndim, static_cast<size_t>(external->ndim));
   ICHECK_EQ(internal->device.device_type, external->device.device_type);
   ICHECK_EQ(internal->device.device_id, external->device.device_id);
@@ -185,7 +187,7 @@ void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) {
   CheckExternalDLTensor(data_ref, eid);
   // Update the data pointer for each argument of each op
   for (DLTensor* t : input_dltensors_[eid]) {
-    t->data = data_ref->data;
+    t->data = static_cast<char*>(data_ref->data) + data_ref->byte_offset;
   }
 }
 /*!
@@ -204,12 +206,12 @@ void GraphExecutor::SetOutputZeroCopy(int index, DLTensor* data_ref) {
 
   // Update the data pointer for output op
   for (DLTensor* t : output_dltensors_[output_node_eid]) {
-    t->data = data_ref->data;
+    t->data = static_cast<char*>(data_ref->data) + data_ref->byte_offset;
   }
 
   // Update the input of the op connected to the output
   for (DLTensor* t : both_output_opinput_dltensors_[output_node_eid]) {
-    t->data = data_ref->data;
+    t->data = static_cast<char*>(data_ref->data) + data_ref->byte_offset;
   }
 }
 /*!

From 7f1c54f96ae4099c178f45402f3c156a565dedce Mon Sep 17 00:00:00 2001
From: Andrey Malyshev <elvin.nnov@gmail.com>
Date: Wed, 18 May 2022 14:00:07 +0300
Subject: [PATCH 15/59] Fix eltwise alter op layout for broadcast axis (#11337)

* Fix eltwise alter op layout for broadcast axis

* Add tests on broadcast blocking over already blocked layout
---
 src/relay/transforms/infer_layout_utils.cc    |   3 +-
 .../python/relay/test_pass_alter_op_layout.py | 200 ++++++++++++++++++
 2 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/src/relay/transforms/infer_layout_utils.cc b/src/relay/transforms/infer_layout_utils.cc
index 32838e09a4410..efe886c29d23b 100644
--- a/src/relay/transforms/infer_layout_utils.cc
+++ b/src/relay/transforms/infer_layout_utils.cc
@@ -64,7 +64,8 @@ Layout AdjustSubordinateFactors(const Layout& src_layout, const Layout& old_layo
 
         // 4) a) Check if this shape element is 1.
         if (auto* shape_int = shape_val.as<IntImmNode>()) {
-          if (shape_int->value == 1) {
+          // We can treat 1 as broadcast only if axis was not split before
+          if (shape_int->value == 1 && old_layout.IndexOf(LayoutAxis::Get(axis)) == -1) {
             new_layout += "1";
             is_shape_one = true;
           }
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
index cffc33b0bc249..5aff77ad36f56 100644
--- a/tests/python/relay/test_pass_alter_op_layout.py
+++ b/tests/python/relay/test_pass_alter_op_layout.py
@@ -1602,6 +1602,206 @@ def alter_conv2d(attrs, inputs, tinfos, out_type):
     np.testing.assert_allclose(res.numpy(), res1.numpy())
 
 
+def test_alter_layout_blocked_no_broadcast():
+    """Test broadcast operators working on already blocked layout"""
+
+    def before():
+        dtype = "float32"
+        input_shape = (1, 8, 16, 16, 4)
+        filter_shape = (1, 8, 4, 4, 4, 4)
+        bias_shape = (1, 1, 1, 1, 4)
+        A = relay.var("data", shape=input_shape, dtype=dtype)
+        B = relay.var("weight", shape=filter_shape, dtype=dtype)
+        C = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+        conv = relay.nn.conv2d(
+            A,
+            B,
+            data_layout="NCHW4c",
+            kernel_layout="OIHW4i4o",
+            padding=[3, 3, 0, 0],
+            strides=[2, 2],
+            out_dtype=dtype,
+            channels=4,
+            kernel_size=(4, 4),
+        )
+        bias = relay.op.add(conv, C)
+        bias = relay.Function(analysis.free_vars(bias), bias)
+        return bias
+
+    def expected():
+        return before()
+
+    def alter_conv2d(attrs, inputs, tinfos, out_type):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs["data_layout"] = "NCHW4c"
+        new_attrs["kernel_layout"] = "OIHW4i4o"
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        a = run_opt_pass(before(), transform.AlterOpLayout())
+        b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b)
+
+    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
+    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
+    z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32)
+    mod = tvm.IRModule.from_expr(before())
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        with tvm.transform.PassContext(opt_level=4):
+            res = relay.build_module.create_executor(
+                "graph", mod, target="llvm", device=tvm.cpu()
+            ).evaluate()(inp, weight, z)
+    with tvm.transform.PassContext(opt_level=0):
+        res1 = relay.build_module.create_executor(
+            "debug", mod, target="llvm", device=tvm.cpu()
+        ).evaluate()(inp, weight, z)
+    np.testing.assert_allclose(res.numpy(), res1.numpy())
+
+
+def test_alter_layout_blocked_broadcast():
+    """Test broadcast operators working on already blocked layout"""
+
+    def before():
+        dtype = "float32"
+        input_shape = (1, 8, 16, 16, 4)
+        filter_shape = (1, 8, 4, 4, 4, 4)
+        bias_shape = (1, 1, 1, 1, 1)
+        A = relay.var("data", shape=input_shape, dtype=dtype)
+        B = relay.var("weight", shape=filter_shape, dtype=dtype)
+        C = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+        conv = relay.nn.conv2d(
+            A,
+            B,
+            data_layout="NCHW4c",
+            kernel_layout="OIHW4i4o",
+            padding=[3, 3, 0, 0],
+            strides=[2, 2],
+            out_dtype=dtype,
+            channels=4,
+            kernel_size=(4, 4),
+        )
+        bias = relay.op.add(conv, C)
+        bias = relay.Function(analysis.free_vars(bias), bias)
+        return bias
+
+    def expected():
+        return before()
+
+    def alter_conv2d(attrs, inputs, tinfos, out_type):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs["data_layout"] = "NCHW4c"
+        new_attrs["kernel_layout"] = "OIHW4i4o"
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        a = run_opt_pass(before(), transform.AlterOpLayout())
+        b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b)
+
+    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
+    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
+    z = np.random.uniform(size=(1, 1, 1, 1, 1)).astype(np.float32)
+    mod = tvm.IRModule.from_expr(before())
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        with tvm.transform.PassContext(opt_level=4):
+            res = relay.build_module.create_executor(
+                "graph", mod, target="llvm", device=tvm.cpu()
+            ).evaluate()(inp, weight, z)
+    with tvm.transform.PassContext(opt_level=0):
+        res1 = relay.build_module.create_executor(
+            "debug", mod, target="llvm", device=tvm.cpu()
+        ).evaluate()(inp, weight, z)
+    np.testing.assert_allclose(res.numpy(), res1.numpy())
+
+
+def test_alter_layout_re_blocking_broadcast():
+    """Test of re-blocking shapes with broadcast operators"""
+
+    def before():
+        dtype = "float32"
+        input_shape = (1, 8, 16, 16, 4)
+        filter_shape = (1, 8, 4, 4, 4, 4)
+        bias_shape = (1, 1, 1, 1, 4)
+        A = relay.var("data", shape=input_shape, dtype=dtype)
+        B = relay.var("weight", shape=filter_shape, dtype=dtype)
+        C = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+        conv = relay.nn.conv2d(
+            A,
+            B,
+            data_layout="NCHW4c",
+            kernel_layout="OIHW4i4o",
+            padding=[3, 3, 0, 0],
+            strides=[2, 2],
+            out_dtype=dtype,
+            channels=4,
+            kernel_size=(4, 4),
+        )
+        bias = relay.op.add(conv, C)
+        bias = relay.Function(analysis.free_vars(bias), bias)
+        return bias
+
+    def expected():
+        dtype = "float32"
+        input_shape = (1, 8, 16, 16, 4)
+        filter_shape = (1, 8, 4, 4, 4, 4)
+        bias_shape = (1, 1, 1, 1, 4)
+        A = relay.var("data", shape=input_shape, dtype=dtype)
+        B = relay.var("weight", shape=filter_shape, dtype=dtype)
+        C = relay.var("bias", shape=bias_shape, dtype=dtype)
+
+        A = relay.layout_transform(A, src_layout="NCHW4c", dst_layout="NCHW2c")
+        B = relay.layout_transform(B, src_layout="OIHW4i4o", dst_layout="OIHW2i2o")
+
+        conv = relay.nn.conv2d(
+            A,
+            B,
+            data_layout="NCHW2c",
+            kernel_layout="OIHW2i2o",
+            padding=[3, 3, 0, 0],
+            strides=[2, 2],
+            out_dtype=dtype,
+            channels=4,
+            kernel_size=(4, 4),
+        )
+        C = relay.layout_transform(C, src_layout="NCHW4c", dst_layout="NCHW2c")
+        bias = relay.op.add(conv, C)
+        bias = relay.layout_transform(bias, src_layout="NCHW2c", dst_layout="NCHW4c")
+        bias = relay.Function(analysis.free_vars(bias), bias)
+        return bias
+
+    def alter_conv2d(attrs, inputs, tinfos, out_type):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs["data_layout"] = "NCHW2c"
+        new_attrs["kernel_layout"] = "OIHW2i2o"
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        a = run_opt_pass(before(), transform.AlterOpLayout())
+        b = run_opt_pass(expected(), transform.InferType())
+        assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\nExpected = \n" + str(b)
+
+    inp = np.random.uniform(size=(1, 8, 16, 16, 4)).astype(np.float32)
+    weight = np.random.uniform(size=(1, 8, 4, 4, 4, 4)).astype(np.float32)
+    z = np.random.uniform(size=(1, 1, 1, 1, 4)).astype(np.float32)
+    mod = tvm.IRModule.from_expr(before())
+    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
+        with tvm.transform.PassContext(opt_level=4):
+            res = relay.build_module.create_executor(
+                "graph", mod, target="llvm", device=tvm.cpu()
+            ).evaluate()(inp, weight, z)
+    with tvm.transform.PassContext(opt_level=0):
+        res1 = relay.build_module.create_executor(
+            "debug", mod, target="llvm", device=tvm.cpu()
+        ).evaluate()(inp, weight, z)
+    np.testing.assert_allclose(res.numpy(), res1.numpy(), rtol=1e-5, atol=1e-5)
+
+
 def test_broadcast_non_adaptable():
     """NCHW4c + [x, x, 4] and NCHW4c is being altered to NCHW"""
 

From 99caa6533fde8e7264e6659575c03e5ecf54cd6b Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 18 May 2022 12:17:47 +0100
Subject: [PATCH 16/59] [TVMC][ETHOSN] Improve target string to avoid
 duplication (#11272)

* [TVMC][ETHOSN] Improve target string to avoid duplication

Improves the TVMC target string to avoid duplication of the
NPU variant. The new target string will require just the NPU
name followed by -variant=n78. The old target string is deprecated
and will be removed in a subsequent version of TVM.

Change-Id: I4638f36788df3f478435ac13d3531aad2b23f204

* fix linting

Change-Id: I76a9da511899f24a163be669877605cd1a440022

* fix make variant functions and update test error message

Change-Id: Iff553d4b255c0ce0b86bad42eaa94ee9b1c62508
---
 python/tvm/driver/tvmc/composite_target.py     | 18 +++++++++++++++---
 python/tvm/relay/op/contrib/ethosn.py          | 18 +++++++++++++++---
 src/relay/backend/contrib/ethosn/codegen.cc    | 11 ++++++++---
 .../backend/contrib/ethosn/codegen_ethosn.h    |  2 +-
 .../test_ethosn/test_partition_params.py       | 14 +++++++-------
 tests/python/driver/tvmc/test_compiler.py      |  4 +---
 .../driver/tvmc/test_composite_target.py       |  2 +-
 tests/python/driver/tvmc/test_target.py        |  4 ++--
 8 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py
index de743799f01c4..88bea9980014e 100644
--- a/python/tvm/driver/tvmc/composite_target.py
+++ b/python/tvm/driver/tvmc/composite_target.py
@@ -18,12 +18,13 @@
 Provides support to composite target on TVMC.
 """
 import logging
+import warnings
 
 # Make sure Vitis AI codegen is registered
 import tvm.contrib.target.vitis_ai  # pylint: disable=unused-import
 
 from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib
-from tvm.relay.op.contrib.ethosn import partition_for_ethosn78
+from tvm.relay.op.contrib.ethosn import partition_for_ethosn
 from tvm.relay.op.contrib.cmsisnn import partition_for_cmsisnn
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 from tvm.relay.op.contrib.bnns import partition_for_bnns
@@ -55,9 +56,9 @@
         "config_key": "relay.ext.cmsisnn.options",
         "pass_pipeline": partition_for_cmsisnn,
     },
-    "ethos-n78": {
+    "ethos-n": {
         "config_key": "relay.ext.ethos-n.options",
-        "pass_pipeline": partition_for_ethosn78,
+        "pass_pipeline": partition_for_ethosn,
     },
     "ethos-u": {
         "config_key": "relay.ext.ethos-u.options",
@@ -71,6 +72,11 @@
         "config_key": "relay.ext.vitis_ai.options",
         "pass_pipeline": partition_for_vitis_ai,
     },
+    # Deprecated in favour of "ethos-n".
+    "ethos-n78": {
+        "config_key": "relay.ext.ethos-n.options",
+        "pass_pipeline": partition_for_ethosn,
+    },
 }
 
 
@@ -99,6 +105,12 @@ def get_codegen_by_target(name):
         requested target codegen information
     """
     try:
+        if name == "ethos-n78":
+            warnings.warn(
+                "Please use 'ethos-n' instead of the deprecated 'ethos-n78' target, "
+                "which will be removed in a later release of TVM.",
+                DeprecationWarning,
+            )
         return REGISTERED_CODEGEN[name]
     except KeyError:
         raise TVMCException("Composite target %s is not defined in TVMC." % name)
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index a1a3e2dccc4cc..17038e749f8e2 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name, unused-argument
 """Arm(R) Ethos(TM)-N NPU supported operators."""
 from enum import Enum
+import warnings
 
 import tvm.ir
 from tvm.relay import transform
@@ -46,7 +47,7 @@ def ethosn_available():
     return Available.SW_AND_HW if hw else Available.SW_ONLY
 
 
-def partition_for_ethosn78(mod, params=None, **opts):
+def partition_for_ethosn(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to Arm Ethos-N NPU.
 
@@ -61,8 +62,19 @@ def partition_for_ethosn78(mod, params=None, **opts):
     -------
     ret : annotated and partitioned module.
     """
-    if not opts or opts.get("variant", "").lower() != "ethos-n78":
-        raise ValueError("When targeting Ethos(TM)-N78, -variant=Ethos-N78 should be set.")
+    opts = opts or {}
+    if "variant" not in opts:
+        raise ValueError("Please specify a variant in the target string, e.g. -variant=n78.")
+
+    # -variant=ethos-n78 deprecated in favour of -variant=n78
+    if opts["variant"].lower() == "ethos-n78":
+        warnings.warn(
+            "Please use '-variant=n78' instead of the deprecated "
+            "'-variant=ethos-n78', which will be removed in TVM v0.9.",
+            DeprecationWarning,
+        )
+    elif opts["variant"] != "n78":
+        raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.")
 
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)
diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc
index d9f7b84b2f764..fc8a4c48dfefa 100644
--- a/src/relay/backend/contrib/ethosn/codegen.cc
+++ b/src/relay/backend/contrib/ethosn/codegen.cc
@@ -213,9 +213,14 @@ String MakeVariant(Optional<EthosnCompilerConfig> configuration) {
   String variant = configuration.value()->variant;
   // Transform variant string to lowercase for comparison
   std::string variant_string = variant.c_str();
-  std::transform(variant_string.begin(), variant_string.end(), variant_string.begin(), ::tolower);
-  std::string variant_n78 = "ethos-n78";
-  if (variant_string == variant_n78) {
+
+  // Checking deprecated variant format. Support for specifying
+  // the variant in this way only remains for backwards compatibility
+  // and will be removed in a later release of TVM.
+  std::string deprecated_variant_string = variant_string;
+  std::transform(deprecated_variant_string.begin(), deprecated_variant_string.end(),
+                 deprecated_variant_string.begin(), ::tolower);
+  if (variant_string == "n78" || deprecated_variant_string == "ethos-n78") {
     String tops = configuration.value()->tops;
     String ple_ratio = configuration.value()->ple_ratio;
     variant = "Ethos-N78_" + tops + "TOPS_" + ple_ratio + "PLE_RATIO";
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index cca96c044c848..9da4e5b18bd5d 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -251,7 +251,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
   String compiler_algorithm;
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
-    TVM_ATTR_FIELD(variant).describe("See Ethos-N documentation.").set_default("Ethos-N78");
+    TVM_ATTR_FIELD(variant).describe("See Ethos-N documentation.").set_default("n78");
     TVM_ATTR_FIELD(sram_size)
         .describe("Optionally override the default sram size. See Ethos(TM)-N documentation.")
         .set_default("0");
diff --git a/tests/python/contrib/test_ethosn/test_partition_params.py b/tests/python/contrib/test_ethosn/test_partition_params.py
index 174bdd9416a4c..34e22e6aaba8e 100644
--- a/tests/python/contrib/test_ethosn/test_partition_params.py
+++ b/tests/python/contrib/test_ethosn/test_partition_params.py
@@ -22,7 +22,7 @@
 from tvm import relay
 import numpy as np
 
-from tvm.relay.op.contrib.ethosn import partition_for_ethosn78
+from tvm.relay.op.contrib.ethosn import partition_for_ethosn
 from tvm.testing import requires_ethosn
 
 
@@ -35,14 +35,14 @@ def test_ethosn78_partition_no_error():
     res = relay.nn.bias_add(res, b, axis=1)
 
     mod = tvm.IRModule.from_expr(res)
-    opts = {"variant": "Ethos-N78"}
-    partition_for_ethosn78(mod, **opts)
+    opts = {"variant": "n78"}
+    partition_for_ethosn(mod, **opts)
 
 
 @requires_ethosn
 def test_ethosn78_partition_undefined_variant():
     with pytest.raises(
-        ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=Ethos-N78 should be set.*"
+        ValueError, match=r".*Please specify a variant in the target string, e.g. -variant=n78.*"
     ):
         a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
         w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
@@ -53,13 +53,13 @@ def test_ethosn78_partition_undefined_variant():
         res = relay.nn.bias_add(res, b, axis=1)
 
         mod = tvm.IRModule.from_expr(res)
-        partition_for_ethosn78(mod)
+        partition_for_ethosn(mod)
 
 
 @requires_ethosn
 def test_ethosn78_partition_invalid_variant():
     with pytest.raises(
-        ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=Ethos-N78 should be set.*"
+        ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=n78 should be set.*"
     ):
         a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8")
         w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8"))
@@ -71,4 +71,4 @@ def test_ethosn78_partition_invalid_variant():
 
         mod = tvm.IRModule.from_expr(res)
         opts = {"variant": "Ethos-N"}
-        partition_for_ethosn78(mod, **opts)
+        partition_for_ethosn(mod, **opts)
diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py
index 2acb179735155..bfbf9922e0a3a 100644
--- a/tests/python/driver/tvmc/test_compiler.py
+++ b/tests/python/driver/tvmc/test_compiler.py
@@ -419,9 +419,7 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn(
 def test_compile_tflite_module_with_external_codegen_ethos_n78(tflite_mobilenet_v1_1_quant):
     pytest.importorskip("tflite")
     tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant)
-    tvmc_package = tvmc.compile(
-        tvmc_model, target="ethos-n78 -variant=ethos-n78, llvm", dump_code="relay"
-    )
+    tvmc_package = tvmc.compile(tvmc_model, target="ethos-n -variant=n78, llvm", dump_code="relay")
     dumps_path = tvmc_package.package_path + ".relay"
 
     # check for output types
diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py
index d0893af7c1c1b..ca08d3e66fa77 100644
--- a/tests/python/driver/tvmc/test_composite_target.py
+++ b/tests/python/driver/tvmc/test_composite_target.py
@@ -33,7 +33,7 @@
 def test_get_codegen_names():
     names = tvmc.composite_target.get_codegen_names()
 
-    assert "ethos-n78" in names
+    assert "ethos-n" in names
     assert "vitis-ai" in names
     assert len(names) > 0
 
diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py
index eb3ffdea42b36..b842618efccd3 100644
--- a/tests/python/driver/tvmc/test_target.py
+++ b/tests/python/driver/tvmc/test_target.py
@@ -153,10 +153,10 @@ def test_parse_quotes_and_separators_on_options():
 
 
 def test_parse_multiple_target_with_opts_ethos_n78():
-    targets = parse_target("ethos-n78 -myopt=value, llvm -device=arm_cpu --system-lib")
+    targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu --system-lib")
 
     assert len(targets) == 2
-    assert "ethos-n78" == targets[0]["name"]
+    assert "ethos-n" == targets[0]["name"]
     assert "myopt" in targets[0]["opts"]
     assert "value" == targets[0]["opts"]["myopt"]
     assert "llvm" == targets[1]["name"]

From 1b32245f0bb4a76ff10b34c37e01413bda6a4021 Mon Sep 17 00:00:00 2001
From: Nicola Lancellotti <nicola.lancellotti@arm.com>
Date: Wed, 18 May 2022 14:22:30 +0200
Subject: [PATCH 17/59] [microNPU] Add a pass to reorder copy and compute nodes
 (#10959)

---
 .../backend/contrib/ethosu/tir/compiler.py    |   1 +
 .../backend/contrib/ethosu/tir/passes.py      |  25 +
 src/tir/contrib/ethosu/passes.cc              | 108 ++++
 .../cascader/test_memory_reduction.py         |  16 +-
 .../test_copy_compute_reordering.py           | 472 ++++++++++++++++++
 .../test_ethosu/test_encode_constants.py      | 247 +++++----
 .../contrib/test_ethosu/test_networks.py      |  18 +-
 .../contrib/test_ethosu/test_replace_copy.py  |   6 +-
 .../contrib/test_ethosu/test_scheduler.py     |  43 +-
 9 files changed, 768 insertions(+), 168 deletions(-)
 create mode 100644 tests/python/contrib/test_ethosu/test_copy_compute_reordering.py

diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
index f2c294cfed1a5..db216e43e2d1a 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
         mod = tvm.tir.transform.RemoveNoOp()(mod)
         mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
         mod = ethosu_passes.HoistAllocates()(mod)
+        mod = ethosu_passes.CopyComputeReordering()(mod)
         disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False)
         if not disable_storage_rewrite:
             mod = tvm.tir.transform.StorageRewrite()(mod)
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
index baadede08d668..76726132e05de 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name, unused-argument, no-else-return, inconsistent-return-statements, too-many-nested-blocks
 """The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler."""
 from collections import namedtuple
+from typing import Optional
 import numpy as np  # type: ignore
 
 import tvm
@@ -913,3 +914,27 @@ def HoistAllocates() -> tvm.IRModule:
         The new module with hoisted allocate nodes.
     """
     return _ffi_api.HoistAllocates()
+
+
+def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.transform.Pass:
+    """
+    Creates a pass that reorders copy and compute nodes so that independent
+    DMA copies and computes can happen in parallel.
+    Copies to buffers with local scope are not reordered, because they copy
+    the LUT into the SHRAM, which already happens in parallel with copying
+    weights into the weights encoder.
+
+    Parameters
+    ----------
+    max_copy_movements: Optional[int]
+        The maximum number of movements allowed for a copy.
+        If None, the pass context option
+        tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements
+        is used if provided, otherwise the default value will be 1.
+
+    Returns
+    -------
+    tvm.transform.Pass
+        The pass; applying it to a module returns the reordered module.
+    """
+    return _ffi_api.CopyComputeReordering(max_copy_movements)
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index 45161499f5be4..2b7b2b4741e67 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -27,7 +27,17 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <algorithm>
+
 namespace tvm {
+
+/*!
+ * \brief The maximum number of movements allowed for a copy in the CopyComputeReordering pass.
+ */
+constexpr const char* kCopyComputeReorderingMaxCopyMovements =
+    "tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements";
+TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingMaxCopyMovements, Integer);
+
 namespace tir {
 namespace contrib {
 namespace ethosu {
@@ -110,6 +120,104 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, because they copy the LUT
+ * into the SHRAM, which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  explicit CopyComputeReorderingMutator(int max_copy_movements)
+      : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto prim_func_node{main_func.CopyOnWrite()};
+      prim_func_node->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(prim_func_node);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    // Each copy to a buffer with global scope is moved up at most
+    // `_max_copy_movements` times, and never past another copy statement.
+    for (size_t index = 0; index < new_seq.size(); ++index) {
+      if (stmt_is_global_copy(new_seq[index])) {
+        int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
+        for (int i = index; i > lower && !stmt_is_copy(new_seq[i - 1]); --i) {
+          std::swap(new_seq[i - 1], new_seq[i]);
+        }
+      }
+    }
+
+    auto seq_stmt_node{CopyOnWrite(op)};
+    seq_stmt_node->seq = std::move(new_seq);
+    return Stmt{seq_stmt_node};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(const Stmt& stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(const Stmt& stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
+           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
+  }
+
+  /*! The maximum number of movements allowed for a copy; values <= 0 disable reordering. */
+  int _max_copy_movements;
+};
+
+/*!
+ * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies
+ * and computes happen in parallel.
+ *
+ * \param max_copy_movements: The maximum number of movements allowed for a copy.
+ *  If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements
+ *  is used if provided, otherwise the default value will be 1.
+ * \return The PrimFunc pass that applies the reordering.
+ */
+tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements) {
+  auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
+    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))
+        << "Expected a single primitive function called 'main'. Please run the "
+           "CopyComputeReordering "
+           "pass in conjunction with the LowerToTIR() pass.";
+    auto value = max_copy_movements.value_or(
+        ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value());
+    return CopyComputeReorderingMutator(value)(f);
+  };
+  return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
+                                                 "tir.contrib.ethos-u.CopyComputeReordering", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering")
+    .set_body_typed(CopyComputeReordering);
+
 }  // namespace ethosu
 }  // namespace contrib
 }  // namespace tir
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index 5e4117e50f8e1..01545217beb48 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -91,10 +91,10 @@ def _get_ethosu_workspace_size(
 @pytest.mark.parametrize(
     "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
     [
-        ("ethos-u55-256", 1067408, 14096),
-        ("ethos-u55-128", 1067408, 3968),
-        ("ethos-u55-64", 1067408, 3968),
-        ("ethos-u55-32", 1067392, 3952),
+        ("ethos-u55-256", 1067520, 14208),
+        ("ethos-u55-128", 1067520, 4080),
+        ("ethos-u55-64", 1067520, 4080),
+        ("ethos-u55-32", 1067504, 4064),
     ],
 )
 def test_double_conv2d(
@@ -161,10 +161,10 @@ def tf_graph(x):
 @pytest.mark.parametrize(
     "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
     [
-        ("ethos-u55-256", 180096, 15008),
-        ("ethos-u55-128", 180096, 14240),
-        ("ethos-u55-64", 180096, 14240),
-        ("ethos-u55-32", 180096, 14240),
+        ("ethos-u55-256", 180288, 15200),
+        ("ethos-u55-128", 180288, 14432),
+        ("ethos-u55-64", 180288, 14432),
+        ("ethos-u55-32", 180272, 14416),
     ],
 )
 def test_depthwise2d_conv2d_pooling(
diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
new file mode 100644
index 0000000000000..eebaa3b816b42
--- /dev/null
+++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
@@ -0,0 +1,472 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import tvm
+from tvm.script import tir as T
+from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering
+
+# fmt: off
+@tvm.script.ir_module
+class AllOperatorsWithWeights:
+    @T.prim_func
+    def main() -> None:
+        # function attr dict
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+        buffer1 = T.buffer_decl([8192], "int8")
+        buffer2 = T.buffer_decl([128], "uint8")
+        buffer3 = T.buffer_decl([32], "uint8")
+        buffer4 = T.buffer_decl([112], "uint8")
+        buffer5 = T.buffer_decl([32], "uint8")
+        buffer6 = T.buffer_decl([112], "uint8")
+        buffer7 = T.buffer_decl([32], "uint8")
+        buffer8 = T.buffer_decl([112], "uint8")
+        buffer9 = T.buffer_decl([32], "uint8")
+        buffer10 = T.buffer_decl([2048], "int8")
+        # body
+        p1 = T.allocate([128], "uint8", "global")
+        p2 = T.allocate([112], "uint8", "global")
+        p3 = T.allocate([112], "uint8", "global")
+        p4 = T.allocate([32], "uint8", "global")
+        p5 = T.allocate([32], "uint8", "global")
+        p6 = T.allocate([32], "uint8", "global")
+        p7 = T.allocate([112], "uint8", "global")
+        p8 = T.allocate([32], "uint8", "global")
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+# fmt: on
+
+
+def test_all_operators_with_weights_max_copy_movements_0():
+    test_mod = CopyComputeReordering(0)(AllOperatorsWithWeights)
+    reference_mod = AllOperatorsWithWeights
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_all_operators_with_weights_max_copy_movements_1():
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering(1)(AllOperatorsWithWeights)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_all_operators_with_weights_max_copy_movements_2():
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering(2)(AllOperatorsWithWeights)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+# fmt: off
+@tvm.script.ir_module
+class AllOperatorsWithoutWeights:
+    @T.prim_func
+    def main() -> None:
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+        buffer1 = T.buffer_decl([36], "int8")
+        buffer2 = T.buffer_decl([9], "int8")
+        # body
+        p1 = T.allocate([96], "int8", "global")
+        T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+# fmt: on
+
+
+@pytest.mark.parametrize("max_copy_movements", [0, 1, 2])
+def test_all_operators_without_weights(max_copy_movements):
+    test_mod = CopyComputeReordering(max_copy_movements)(AllOperatorsWithoutWeights)
+    reference_mod = AllOperatorsWithoutWeights
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+# fmt: off
+@tvm.script.ir_module
+class OperatorsWithAndWithoutWeights:
+    @T.prim_func
+    def main() -> None:
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+        buffer1 = T.buffer_decl([97156], "int8")
+        buffer2 = T.buffer_decl([80], "uint8")
+        buffer3 = T.buffer_decl([64], "uint8")
+        buffer4 = T.buffer_decl([96], "uint8")
+        buffer5 = T.buffer_decl([32], "uint8")
+        # body
+        p1 = T.allocate([390336], "int8", "global")
+        p2 = T.allocate([80], "uint8", "global")
+        p3 = T.allocate([64], "uint8", "global")
+        p4 = T.allocate([390336], "int8", "global")
+        p5 = T.allocate([96], "uint8", "global")
+        p6 = T.allocate([32], "uint8", "global")
+        T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+# fmt: on
+
+
+def test_operators_with_and_without_weights_max_copy_movements_0():
+    test_mod = CopyComputeReordering(0)(OperatorsWithAndWithoutWeights)
+    reference_mod = OperatorsWithAndWithoutWeights
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_operators_with_and_without_weights_max_copy_movements_1():
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:  # expected result of CopyComputeReordering(1) on OperatorsWithAndWithoutWeights
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([97156], "int8")
+            buffer2 = T.buffer_decl([80], "uint8")
+            buffer3 = T.buffer_decl([64], "uint8")
+            buffer4 = T.buffer_decl([96], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            # body
+            p1 = T.allocate([390336], "int8", "global")
+            p2 = T.allocate([80], "uint8", "global")
+            p3 = T.allocate([64], "uint8", "global")
+            p4 = T.allocate([390336], "int8", "global")
+            p5 = T.allocate([96], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))  # p2/p3 are read by the first conv2d below
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))  # p5/p6 are read by the last conv2d; here their copies sit between the pooling and the first conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering(1)(OperatorsWithAndWithoutWeights)  # pass under test
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)  # True = map_free_vars
+
+
+def test_operators_with_and_without_weights_max_copy_movements_2():
+    """CopyComputeReordering(2) is expected to place all four copies ahead of the pooling."""
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([97156], "int8")
+            buffer2 = T.buffer_decl([80], "uint8")
+            buffer3 = T.buffer_decl([64], "uint8")
+            buffer4 = T.buffer_decl([96], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            # body
+            p1 = T.allocate([390336], "int8", "global")
+            p2 = T.allocate([80], "uint8", "global")
+            p3 = T.allocate([64], "uint8", "global")
+            p4 = T.allocate([390336], "int8", "global")
+            p5 = T.allocate([96], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))  # p2/p3 are read by the first conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))  # p5/p6 are read by the last conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering(2)(OperatorsWithAndWithoutWeights)
+    tvm.ir.assert_structural_equal(test_mod, ReferenceModule, True)  # True = map_free_vars
+
+
+# fmt: off
+@tvm.script.ir_module
+class CopyToBufferWithLocalScope:  # input module for the tests below; two of its copies target "local"-scope buffers
+    @T.prim_func
+    def main() -> None:
+        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+        buffer1 = T.buffer_decl([64], "uint8")
+        buffer2 = T.buffer_decl([48], "uint8")
+        buffer3 = T.buffer_decl([48], "uint8")
+        buffer4 = T.buffer_decl([256], "uint8")
+        buffer5 = T.buffer_decl([16], "uint8")
+        buffer6 = T.buffer_decl([48], "uint8")
+        buffer7 = T.buffer_decl([256], "uint8")
+        buffer8 = T.buffer_decl([64], "uint8")
+        # body
+        p1 = T.allocate([48], "uint8", "global")
+        p2 = T.allocate([48], "uint8", "global")
+        p3 = T.allocate([256], "int8", "local")
+        p4 = T.allocate([256], "int8", "global")
+        p5 = T.allocate([16], "uint8", "global")
+        p6 = T.allocate([48], "uint8", "global")
+        p7 = T.allocate([256], "int8", "local")
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))  # p1/p2 are read by the conv2d
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))  # destination p3 has "local" scope
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))  # p5/p6 are read by the depthwise conv2d
+        T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))  # destination p7 has "local" scope
+        T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+# fmt: on
+
+
+def test_copy_to_buffer_with_local_scope_max_copy_movements_0():
+    """CopyComputeReordering(0) must return a module structurally equal to its input."""
+    transformed = CopyComputeReordering(0)(CopyToBufferWithLocalScope)
+    tvm.ir.assert_structural_equal(transformed, CopyToBufferWithLocalScope, True)
+
+
+@pytest.mark.parametrize("max_copy_movements", [1, 2])  # both values share one reference
+def test_copy_to_buffer_with_local_scope_max_copy_movements_n(max_copy_movements):
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:  # expected result of the pass on CopyToBufferWithLocalScope for either parameter value
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))  # p5/p6 are read by the depthwise conv2d; here their copies come before the conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))  # copy into "local"-scope p7 is still directly before the depthwise conv2d
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering(max_copy_movements)(CopyToBufferWithLocalScope)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)  # True = map_free_vars
+
+
+def test_multiple_prim_funcs():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:  # deliberately invalid input: defines "abc" in addition to "main"
+        @T.prim_func
+        def main():
+            T.evaluate(0)
+
+        @T.prim_func
+        def abc():
+            T.evaluate(0)
+    # fmt: on
+
+    with pytest.raises(  # the pass is expected to reject InputModule with this message
+        tvm.TVMError,
+        match=r"Expected a single primitive function called 'main'. "
+        r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass.",
+    ):
+        CopyComputeReordering(1)(InputModule)
+
+
+def test_no_main_prim_func():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:  # deliberately invalid input: its only prim func is "abs", not "main"
+        @T.prim_func
+        def abs():
+            T.evaluate(0)
+    # fmt: on
+
+    with pytest.raises(  # the pass is expected to reject InputModule with this message
+        tvm.TVMError,
+        match=r"Expected a single primitive function called 'main'. "
+        r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass.",
+    ):
+        CopyComputeReordering(1)(InputModule)
+
+
+def test_default_max_copy_movements():
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:  # expected result when CopyComputeReordering() is constructed without an argument
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([97156], "int8")
+            buffer2 = T.buffer_decl([80], "uint8")
+            buffer3 = T.buffer_decl([64], "uint8")
+            buffer4 = T.buffer_decl([96], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            # body
+            p1 = T.allocate([390336], "int8", "global")
+            p2 = T.allocate([80], "uint8", "global")
+            p3 = T.allocate([64], "uint8", "global")
+            p4 = T.allocate([390336], "int8", "global")
+            p5 = T.allocate([96], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))  # p2/p3 are read by the first conv2d below
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))  # p5/p6 are read by the last conv2d; here their copies sit between the pooling and the first conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights)  # no argument given
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)  # True = map_free_vars
+
+
+def test_pass_context_option_max_copy_movements():
+    """The PassContext option (set to 2) must be honoured when no argument is passed."""
+    # fmt: off
+    @tvm.script.ir_module
+    class ReferenceModule:  # expected result: all four copies are placed ahead of the pooling
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([97156], "int8")
+            buffer2 = T.buffer_decl([80], "uint8")
+            buffer3 = T.buffer_decl([64], "uint8")
+            buffer4 = T.buffer_decl([96], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            # body
+            p1 = T.allocate([390336], "int8", "global")
+            p2 = T.allocate([80], "uint8", "global")
+            p3 = T.allocate([64], "uint8", "global")
+            p4 = T.allocate([390336], "int8", "global")
+            p5 = T.allocate([96], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))  # p2/p3 are read by the first conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle"))  # p5/p6 are read by the last conv2d
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    with tvm.transform.PassContext(
+        config={"tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements": 2}
+    ):
+        test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights)
+    tvm.ir.assert_structural_equal(test_mod, ReferenceModule, True)  # True = map_free_vars
+
+
+if __name__ == "__main__":  # executing this file directly hands it to pytest
+    pytest.main([__file__])
diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py
index 92e6cd3e19cb8..15b719f33c3f9 100644
--- a/tests/python/contrib/test_ethosu/test_encode_constants.py
+++ b/tests/python/contrib/test_ethosu/test_encode_constants.py
@@ -37,33 +37,34 @@ class WeightStreamOnlyU55:
     def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.buffer_decl([128], "uint8")
-        buffer_1 = T.buffer_decl([32], "uint8")
-        buffer_2 = T.buffer_decl([112], "uint8")
-        buffer_3 = T.buffer_decl([32], "uint8")
-        buffer_4 = T.buffer_decl([112], "uint8")
-        buffer_5 = T.buffer_decl([32], "uint8")
-        buffer_6 = T.buffer_decl([112], "uint8")
-        buffer_7 = T.buffer_decl([32], "uint8")
+        buffer1 = T.buffer_decl([128], "uint8")
+        buffer2 = T.buffer_decl([32], "uint8")
+        buffer3 = T.buffer_decl([112], "uint8")
+        buffer4 = T.buffer_decl([32], "uint8")
+        buffer5 = T.buffer_decl([112], "uint8")
+        buffer6 = T.buffer_decl([32], "uint8")
+        buffer7 = T.buffer_decl([112], "uint8")
+        buffer8 = T.buffer_decl([32], "uint8")
         T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
         T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
         # body
-        p1_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p2_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
-        p1_global_1 = T.buffer_decl([112], dtype="uint8", data=p1_global.data)
-        p2_global_1 = T.buffer_decl([32], dtype="uint8", data=p2_global.data)
-        T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, p1_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, p2_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global[0], 128, T.int8(-1), T.int8(-1), 12, p2_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 112, p1_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, p2_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 112, p1_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, p2_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 112, p1_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, p2_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p4 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        buffer9 = T.buffer_decl([112], "uint8", data=p1.data)
+        T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, buffer9[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 
 
@@ -74,37 +75,34 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
-        buffer_encoded = T.buffer_decl([160], dtype="uint8")
-        buffer_encoded_1 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_2 = T.buffer_decl([160], dtype="uint8")
-        buffer_encoded_3 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_4 = T.buffer_decl([176], dtype="uint8")
-        buffer_encoded_5 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_6 = T.buffer_decl([160], dtype="uint8")
-        buffer_encoded_7 = T.buffer_decl([32], dtype="uint8")
         T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
         T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+        buffer_encoded_1 = T.buffer_decl([160], dtype="uint8")
+        buffer_encoded_1_1 = T.buffer_decl([32], dtype="uint8")
+        buffer_encoded_2_1 = T.buffer_decl([160], dtype="uint8")
+        buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8")
+        buffer_encoded_4_1 = T.buffer_decl([176], dtype="uint8")
+        buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8")
+        buffer_encoded_6_1 = T.buffer_decl([160], dtype="uint8")
+        buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8")
         # body
         placeholder_global = T.allocate([176], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data)
-        placeholder_global_2 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data)
-        placeholder_global_3 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data)
         placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 160, placeholder_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle"))
+        placeholder_global_2 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data)
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 160, placeholder_global_1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_1[0], 32, placeholder_d_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 160, placeholder_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 80, placeholder_global_1[80], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 160, placeholder_global_2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 176, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 160, placeholder_global_3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 80, placeholder_global_3[80], 80, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 176, placeholder_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 160, placeholder_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 # fmt: on
 
@@ -172,19 +170,21 @@ class RereadWeightsU55:
     def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.buffer_decl([304], "uint8")
-        buffer_1 = T.buffer_decl([80], "uint8")
+        buffer1 = T.buffer_decl([304], "uint8")
+        buffer2 = T.buffer_decl([80], "uint8")
         T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
         T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
         # body
-        placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
-        T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
+        T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p2[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 304, T.int8(-1), T.int8(-1), 12, p4[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 
 
@@ -195,20 +195,20 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,),
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
         # buffer definition
-        placeholder_encoded = T.buffer_decl([368], dtype="uint8")
-        placeholder_encoded_1 = T.buffer_decl([96], dtype="uint8")
         T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
         T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
+        placeholder_encoded_1 = T.buffer_decl([368], "uint8")
+        placeholder_encoded_1_2 = T.buffer_decl([96], "uint8")
         # body
         placeholder_global = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_global_1 = T.buffer_decl([368], dtype="uint8", data=placeholder_global.data)
         placeholder_d_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_d_global.data)
-        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global[0], dtype="handle"))
+        placeholder_global_1 = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_d_global_1 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True})
+        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global_1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global_1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 192, placeholder_global[192], 176, 12, placeholder_d_global[0], 48, placeholder_d_global[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global_1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 192, placeholder_global_1[192], 176, 12, placeholder_d_global_1[0], 48, placeholder_d_global_1[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
 
     __tvm_meta__ = None
@@ -374,35 +374,37 @@ class MixedReadU55:
     def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer = T.buffer_decl([592], "uint8")
-        buffer_1 = T.buffer_decl([160], "uint8")
-        buffer_2 = T.buffer_decl([80], "uint8")
-        buffer_3 = T.buffer_decl([32], "uint8")
-        buffer_4 = T.buffer_decl([80], "uint8")
-        buffer_5 = T.buffer_decl([32], "uint8")
-        buffer_6 = T.buffer_decl([80], "uint8")
-        buffer_7 = T.buffer_decl([32], "uint8")
-        buffer_8 = T.buffer_decl([80], "uint8")
-        buffer_9 = T.buffer_decl([32], "uint8")
+        buffer1 = T.buffer_decl([80], "uint8")
+        buffer2 = T.buffer_decl([32], "uint8")
+        buffer3 = T.buffer_decl([80], "uint8")
+        buffer4 = T.buffer_decl([32], "uint8")
+        buffer5 = T.buffer_decl([80], "uint8")
+        buffer6 = T.buffer_decl([32], "uint8")
+        buffer7 = T.buffer_decl([80], "uint8")
+        buffer8 = T.buffer_decl([32], "uint8")
+        buffer9 = T.buffer_decl([592], "uint8")
+        buffer10 = T.buffer_decl([160], "uint8")
         T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
         T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
         # body
-        ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 80, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 80, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_8[0], 80, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_9[0], 32, placeholder_d_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        p1 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p5 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 80, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 80, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p5[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 80, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 80, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p5[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 
 
@@ -412,42 +414,37 @@ class MixedReadU65:
     def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+        T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
+        T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
         # buffer definition
-        buffer_encoded = T.buffer_decl([96], dtype="uint8")
-        buffer_encoded_1 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_2 = T.buffer_decl([96], dtype="uint8")
-        buffer_encoded_3 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_4 = T.buffer_decl([96], dtype="uint8")
-        buffer_encoded_5 = T.buffer_decl([32], dtype="uint8")
-        buffer_encoded_6 = T.buffer_decl([96], dtype="uint8")
-        buffer_encoded_7 = T.buffer_decl([32], dtype="uint8")
-        placeholder_encoded = T.buffer_decl([608], dtype="uint8")
-        placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8")
-        T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
-        T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
-        # body
-        ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        buffer_encoded_1 = T.buffer_decl([96], dtype="uint8")
+        buffer_encoded_1_2 = T.buffer_decl([32], dtype="uint8")
+        placeholder_encoded_1 = T.buffer_decl([608], dtype="uint8")
+        placeholder_encoded_1_2 = T.buffer_decl([160], dtype="uint8")
+        buffer_encoded_2_1 = T.buffer_decl([96], dtype="uint8")
+        buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8")
+        buffer_encoded_4_1 = T.buffer_decl([96], dtype="uint8")
+        buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8")
+        buffer_encoded_6_1 = T.buffer_decl([96], dtype="uint8")
+        buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8")
         placeholder_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data)
-        placeholder_global_2 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data)
-        placeholder_global_3 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data)
         placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
-        placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data)
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 96, placeholder_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle"))
+        ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_global_2 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True})
+        placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 96, placeholder_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_2[0], 32, placeholder_d_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_1[0], 304, placeholder_encoded_1[304], 304, 12, placeholder_encoded_1_2[0], 80, placeholder_encoded_1_2[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 96, placeholder_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 96, placeholder_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 48, placeholder_global_1[48], 48, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 96, placeholder_global_2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 96, placeholder_global_3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 48, placeholder_global_3[48], 48, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 96, placeholder_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 96, placeholder_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 # fmt: on
 
diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py
index e04cabe79d2fe..f64263ca0623b 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -43,13 +43,13 @@
 @pytest.mark.parametrize(
     "accel_type, model_url, workspace_size",
     [
-        ("ethos-u65-256", MOBILENET_V1_URL, 1423344),
-        ("ethos-u65-256", MOBILENET_V2_URL, 2185584),
-        ("ethos-u55-256", MOBILENET_V1_URL, 1423344),
-        ("ethos-u55-256", MOBILENET_V2_URL, 2185584),
-        ("ethos-u55-128", MOBILENET_V2_URL, 2185584),
-        ("ethos-u55-64", MOBILENET_V2_URL, 2185584),
-        ("ethos-u55-32", MOBILENET_V2_URL, 2185584),
+        ("ethos-u65-256", MOBILENET_V1_URL, 1892704),
+        ("ethos-u65-256", MOBILENET_V2_URL, 2257984),
+        ("ethos-u55-256", MOBILENET_V1_URL, 1892704),
+        ("ethos-u55-256", MOBILENET_V2_URL, 2257984),
+        ("ethos-u55-128", MOBILENET_V2_URL, 2257984),
+        ("ethos-u55-64", MOBILENET_V2_URL, 2257984),
+        ("ethos-u55-32", MOBILENET_V2_URL, 2258000),
     ],
 )
 def test_networks_without_usmp(accel_type, model_url, workspace_size):
@@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
 @pytest.mark.parametrize(
     "accel_type, model_url, workspace_size",
     [
-        ("ethos-u65-256", MOBILENET_V1_URL, 1205872),
-        ("ethos-u55-256", MOBILENET_V2_URL, 1507152),
+        ("ethos-u65-256", MOBILENET_V1_URL, 1206880),
+        ("ethos-u55-256", MOBILENET_V2_URL, 1509408),
     ],
 )
 def test_networks_with_usmp(accel_type, model_url, workspace_size):
diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py
index 4f06695b25b10..932df71d24029 100644
--- a/tests/python/contrib/test_ethosu/test_replace_copy.py
+++ b/tests/python/contrib/test_ethosu/test_replace_copy.py
@@ -88,14 +88,14 @@ def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(409
         T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data)
         # body
         placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
-        placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data)
         placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
-        placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data)
+        placeholder_global_unrolled_iter_1 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True})
+        placeholder_d_global_unrolled_iter_1 = T.allocate([64],  "uint8", "global", annotations={"disable_lower_builtin": True})
         T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle"))
         T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
         T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_1[0], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 # fmt: on
diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py
index 8a83e769141da..4baea26e591ef 100644
--- a/tests/python/contrib/test_ethosu/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/test_scheduler.py
@@ -180,29 +180,27 @@ def test_schedule_cache_reads():
 @tvm.script.ir_module
 class DiamondGraphTir:
     @T.prim_func
-    def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None:
+    def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None:
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data)
-        T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data)
-
-        weight_buffer = T.buffer_decl([2608], "uint8")
-        bias_buffer = T.buffer_decl([240], "uint8")
-        weight_buffer2 = T.buffer_decl([736], "uint8")
-        bias_buffer2 = T.buffer_decl([240], "uint8")
-
-        weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
-        weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data)
-        bias_global = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
-        featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})
-        featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True})
-
-        T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
-        T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
+        T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data)
+        T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data)
+        buffer1 = T.buffer_decl([2608], "uint8")
+        buffer2 = T.buffer_decl([240], "uint8")
+        buffer3 = T.buffer_decl([736], "uint8")
+        buffer4 = T.buffer_decl([240], "uint8")
+        p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True})
+        p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
+        p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True})
+        T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p2[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, T.int8(-1), T.int8(-1), 12, p4[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+        T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle"))
     __tvm_meta__ = None
 # fmt: on
 
@@ -218,7 +216,6 @@ def test_schedule_diamond_graph():
 
     test_mod, _ = _lower_to_tir(func, copy_constants())
     reference_mod = DiamondGraphTir
-
     tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)
 
 

From f88a10fb00419c51a116a63f931a98d8286b23de Mon Sep 17 00:00:00 2001
From: Leandro Nunes <leandro.nunes@arm.com>
Date: Wed, 18 May 2022 14:04:24 +0100
Subject: [PATCH 18/59] [TFLite] Add support to int16 data type in TFLite
 frontend (#10915)

* [TFLite] Add support to int16 data type in TFLite frontend

Add support for int16 data type and int64 biases/accumulators in
the TFLite frontend.

Adjusts TFLite tests to cover int16 convolutions and element-wise;
Fixes a minor typo negtive->negative in the element-wise tests.

* Update src/relay/qnn/op/convolution.cc

Co-authored-by: Elen Kalda <elen.kalda@arm.com>

Co-authored-by: Elen Kalda <elen.kalda@arm.com>
---
 python/tvm/relay/frontend/tflite.py          |  11 +-
 src/relay/qnn/op/convolution.cc              |  48 ++--
 src/relay/qnn/op/dequantize.cc               |   4 +-
 src/relay/qnn/op/quantize.cc                 |   4 +-
 src/relay/qnn/op/requantize.cc               |   8 +-
 tests/python/frontend/tflite/test_forward.py | 257 ++++++++++++++-----
 6 files changed, 235 insertions(+), 97 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index 8d18cc2962ae1..b696bd6d056bc 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -390,6 +390,7 @@ def get_tensor_type_as_numpy(self, tensor_wrapper):
             return {
                 TensorType.UINT8: np.uint8,
                 TensorType.INT8: np.int8,
+                TensorType.INT16: np.int16,
                 TensorType.FLOAT16: np.float16,
                 TensorType.FLOAT32: np.float32,
                 TensorType.INT32: np.int32,
@@ -430,6 +431,8 @@ def get_tensor_type_str(self, tensor_type):
 
         if tensor_type == TensorType.INT8:
             return "int8"
+        if tensor_type == TensorType.INT16:
+            return "int16"
         if tensor_type == TensorType.UINT8:
             return "uint8"
         if tensor_type == TensorType.FLOAT16:
@@ -2149,7 +2152,9 @@ def convert_conv(self, op, conv_type):
             qnn_conv2d_params = dict(params)
             qnn_conv2d_params["input_zero_point"] = input_tensor.qnn_params["zero_point"]
             qnn_conv2d_params["kernel_zero_point"] = weight_tensor.qnn_params["zero_point"]
-            qnn_conv2d_params["out_dtype"] = "int32"
+            qnn_conv2d_params["out_dtype"] = (
+                "int64" if output_tensor_type_str == "int16" else "int32"
+            )
             qnn_conv2d_params["input_scale"] = input_tensor.qnn_params["scale"]
             qnn_conv2d_params["kernel_scale"] = weight_tensor.qnn_params["scale"]
             out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params)
@@ -2160,8 +2165,8 @@ def convert_conv(self, op, conv_type):
         if len(input_tensors) == 3:
             bias_tensor = input_tensors[2]
             bias_tensor_type = bias_tensor.tensor.Type()
-            # bias tensor type should be INT32 (quantization) or FLOAT32
-            assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32)
+            # bias tensor type should be INT32 (int8 qnn) or INT64 (int16 qnn) or FLOAT32
+            assert bias_tensor_type in (TensorType.INT32, TensorType.INT64, TensorType.FLOAT32)
             bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type)
             if self.has_expr(bias_tensor.tensor_idx):
                 bias_expr = self.get_expr(bias_tensor.tensor_idx)
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 8a7521e8ee507..42e4540f0f2c2 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -50,12 +50,14 @@ bool QnnConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (data == nullptr || weight == nullptr) return false;
   const auto* param = attrs.as<Conv2DAttrs>();
   ICHECK(param != nullptr) << "Conv2DAttrs cannot be nullptr.";
-  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8))
-      << "Expected qnn conv2d type(int8, uint8) for input but was " << data->dtype;
+  ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) ||
+         data->dtype == DataType::Int(16))
+      << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype;
   ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8))
       << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype;
-  ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32))
-      << "Expected qnn conv2d type(int32, int16) for output but was " << param->out_dtype;
+  ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) ||
+         param->out_dtype == DataType::Int(64))
+      << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype;
   ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";
 
   // Check the types of scale and zero points.
@@ -190,19 +192,21 @@ WorkloadType GetWorkload(const Array<tvm::relay::Type>& arg_types, const Conv2DA
  */
 Expr Conv2DFallBack(const Expr& data, const Expr& weight, const Expr& input_zero_point,
                     const Expr& kernel_zero_point, const Conv2DAttrs* param) {
-  // Upcast the zero point to Int16.
-  auto zp_data = Cast(input_zero_point, DataType::Int(16));
-  auto zp_kernel = Cast(kernel_zero_point, DataType::Int(16));
+  // Upcast the parameters to be at least int32 to avoid overflow
+  auto upcast_bits = param->out_dtype.bits() < 32 ? 32 : param->out_dtype.bits();
 
-  auto shifted_data = Cast(data, DataType::Int(16));
-  auto zero_scalar = MakeConstantScalar(DataType::Int(32), 0);
+  auto zp_data = Cast(input_zero_point, DataType::Int(upcast_bits));
+  auto zp_kernel = Cast(kernel_zero_point, DataType::Int(upcast_bits));
+
+  auto shifted_data = Cast(data, DataType::Int(upcast_bits));
+  auto zero_scalar = MakeConstantScalar(DataType::Int(upcast_bits), 0);
   if (!IsEqualScalar(input_zero_point, zero_scalar)) {
-    shifted_data = Subtract(Cast(data, DataType::Int(16)), zp_data);
+    shifted_data = Subtract(Cast(data, DataType::Int(upcast_bits)), zp_data);
   }
 
-  auto shifted_kernel = Cast(weight, DataType::Int(16));
+  auto shifted_kernel = Cast(weight, DataType::Int(upcast_bits));
   if (!IsEqualScalar(kernel_zero_point, zero_scalar)) {
-    shifted_kernel = Subtract(Cast(weight, DataType::Int(16)), zp_kernel);
+    shifted_kernel = Subtract(Cast(weight, DataType::Int(upcast_bits)), zp_kernel);
   }
 
   return Conv2D(shifted_data, shifted_kernel, param->strides, param->padding, param->dilation,
@@ -557,6 +561,7 @@ Expr Conv2DThirdTerm(const Expr& weight, const Expr& input_zero_point, const Con
  * \param in_channels The number of input channels.
  * \param kernel_h The height of kernel.
  * \param kernel_w The width of kernel.
+ * \param param The qnn conv2d attributes.
  * \return The sequence of Relay operators for term4.
  * \note The term4 looks like this
  *
@@ -564,10 +569,11 @@ Expr Conv2DThirdTerm(const Expr& weight, const Expr& input_zero_point, const Con
  *
  */
 Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int in_channels,
-                      int kernel_h, int kernel_w) {
+                      int kernel_h, int kernel_w, const Conv2DAttrs* param) {
+  auto upcast_bits = param->out_dtype.bits() < 32 ? 32 : param->out_dtype.bits();
   int scalar_term4 =
       input_zero_point_int * kernel_zero_point_int * in_channels * kernel_h * kernel_w;
-  return MakeConstantScalar(DataType::Int(32), scalar_term4);
+  return MakeConstantScalar(DataType::Int(upcast_bits), scalar_term4);
 }
 
 /*
@@ -578,6 +584,7 @@ Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int i
  * \param in_channels The number of input channels.
  * \param kernel_h The height of kernel.
  * \param kernel_w The width of kernel.
+ * \param param The qnn conv2d attributes.
  * \return The sequence of Relay operators for term4.
  * \note The term4 looks like this
  *
@@ -585,8 +592,10 @@ Expr Conv2DFourthTerm(int input_zero_point_int, int kernel_zero_point_int, int i
  *
  */
 Expr Conv2DFourthTerm(const Expr& input_zero_point, const Expr& kernel_zero_point, int in_channels,
-                      int kernel_h, int kernel_w) {
-  Expr scalar_term4 = MakeConstantScalar(DataType::Int(32), in_channels * kernel_h * kernel_w);
+                      int kernel_h, int kernel_w, const Conv2DAttrs* param) {
+  auto upcast_bits = param->out_dtype.bits() < 32 ? 32 : param->out_dtype.bits();
+  Expr scalar_term4 =
+      MakeConstantScalar(DataType::Int(upcast_bits), in_channels * kernel_h * kernel_w);
   Expr variable_term4 = Multiply(input_zero_point, kernel_zero_point);
   return Multiply(scalar_term4, variable_term4);
 }
@@ -791,10 +800,11 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
   auto term3 = Conv2DThirdTerm(weight, input_zero_point, param, out_channels);
   Expr term4;
   if (dynamic_zp) {
-    term4 = Conv2DFourthTerm(input_zero_point, kernel_zero_point, in_channels, kernel_h, kernel_w);
+    term4 = Conv2DFourthTerm(input_zero_point, kernel_zero_point, in_channels, kernel_h, kernel_w,
+                             param);
   } else {
     term4 = Conv2DFourthTerm(input_zero_point_int, kernel_zero_point_int, in_channels, kernel_h,
-                             kernel_w);
+                             kernel_w, param);
   }
   return Conv2DCombineTerms(term1, term2, term3, term4, input_zero_point_int,
                             kernel_zero_point_int);
@@ -829,7 +839,7 @@ This operator convolves quantized weight with quantized data. The scale of the
 output quantized tensor is the product of the weight_scale and input_scale of
 the input quantized tensors. The zero point of the output quantized tensor is
 0. By default, the dtype of output is int32. Please also refer to Requantize
-operator to understand how to scale back the int32 output to (u)int8.
+operator to understand how to scale back the int32 output to (u)int8 or (u)int16.
 - **data**: This depends on the `layout` parameter. Input is 4D array of shape
             (batch_size, in_channels, height, width) if `layout` is `NCHW`.
 - **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc
index 9a9c60d9ea6f0..1ddcde81234d5 100644
--- a/src/relay/qnn/op/dequantize.cc
+++ b/src/relay/qnn/op/dequantize.cc
@@ -47,8 +47,8 @@ bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 
   const auto input_dtype = data->dtype;
   ICHECK(input_dtype == DataType::Int(8) || input_dtype == DataType::UInt(8) ||
-         input_dtype == DataType::Int(32))
-      << "Input type should be one of the quantized types [unit8, int8, int32] but was "
+         input_dtype == DataType::Int(16) || input_dtype == DataType::Int(32))
+      << "Input type should be one of the quantized types [unit8, int8, int16, int32] but was "
       << input_dtype;
 
   const auto* dequantize_attrs = attrs.as<DequantizeAttrs>();
diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc
index 1a4c853d8929b..06a73ee91cbf5 100644
--- a/src/relay/qnn/op/quantize.cc
+++ b/src/relay/qnn/op/quantize.cc
@@ -76,8 +76,8 @@ bool QuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   const Array<tvm::PrimExpr> oshape = data->shape;
   const DataType out_dtype = quantize_attrs->out_dtype;
   ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) ||
-         out_dtype == DataType::Int(32))
-      << "Output type should be one of [int8, unit8, int32] but was " << out_dtype;
+         out_dtype == DataType::Int(16) || out_dtype == DataType::Int(32))
+      << "Output type should be one of [int8, unit8, int16, int32] but was " << out_dtype;
   // assign output type
   reporter->Assign(types[3], TensorType(oshape, out_dtype));
   return true;
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index ea143fe417136..8601264f53130 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -480,8 +480,8 @@ bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   }
   const auto in_dtype = data->dtype;
   ICHECK(in_dtype == DataType::Int(8) || in_dtype == DataType::UInt(8) ||
-         in_dtype == DataType::Int(32))
-      << "Input type should be one of [int8, uint8, int32] but was " << in_dtype;
+         in_dtype == DataType::Int(32) || in_dtype == DataType::Int(64))
+      << "Input type should be one of [int8, uint8, int32, int64] but was " << in_dtype;
 
   const RequantizeAttrs* requantize_attrs = attrs.as<RequantizeAttrs>();
   int axis = requantize_attrs->axis;
@@ -507,8 +507,8 @@ bool RequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   // assign output type
   auto out_dtype = requantize_attrs->out_dtype;
   ICHECK(out_dtype == DataType::Int(8) || out_dtype == DataType::UInt(8) ||
-         out_dtype == DataType::Int(32))
-      << "Output type should be one of [int8, uint8, int32] but was " << out_dtype;
+         out_dtype == DataType::Int(16) || out_dtype == DataType::Int(32))
+      << "Output type should be one of [int8, uint8, int16, int32] but was " << out_dtype;
   reporter->Assign(types[5], TensorType(oshape, out_dtype));
   return true;
 }
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 80cdcf327f4b8..8c8ca0eab2ffa 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -139,19 +139,38 @@ def vmobj_to_list(o):
 
 
 def _quantize_keras_model(
-    keras_model, representative_data_gen, is_float_input=False, is_float_output=False
+    keras_model,
+    representative_data_gen,
+    is_float_input=False,
+    is_float_output=False,
+    int_quant_dtype=tf.int8,
 ):
     """Utility function to quantize a Keras model using TFLite converter."""
     converter = interpreter_wrapper.TFLiteConverter.from_keras_model(keras_model)
-    converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
-    converter.representative_dataset = representative_data_gen
-    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    if int_quant_dtype == tf.int8:
+        converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
+        converter.representative_dataset = representative_data_gen
+        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+        inference_dtype = tf.uint8
+    elif int_quant_dtype == tf.int16:
+        converter.optimizations = [tf.lite.Optimize.DEFAULT]
+        converter.representative_dataset = representative_data_gen
+        converter.target_spec.supported_ops = [
+            tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+        ]
+        inference_dtype = tf.uint16
+    else:
+        raise RuntimeError(
+            f"Invalid quantized dtype {int_quant_dtype}. Supported types: int8, int16."
+        )
+
     # NOTE: If representative dataset is provided, and inference input type is not set,
     #       then converter will self add quant & dequant Op accordingly.
     if not is_float_input:
-        converter.inference_input_type = tf.uint8
+        converter.inference_input_type = inference_dtype
     if not is_float_output:
-        converter.inference_output_type = tf.uint8
+        converter.inference_output_type = inference_dtype
+
     return converter.convert()
 
 
@@ -271,6 +290,7 @@ def compare_tflite_with_tvm(
     mode="graph_executor",
     experimental_new_converter=False,
     fp16_quantized=False,
+    int_quant_dtype=tf.int8,
 ):
     """Generic function to generate and compare TFLite and TVM output"""
     in_data = convert_to_list(in_data)
@@ -287,7 +307,15 @@ def compare_tflite_with_tvm(
         converter = tf.lite.TFLiteConverter.from_session(sess, input_tensors, output_tensors)
         converter.experimental_new_converter = experimental_new_converter
         if quantized:
-            converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
+            if int_quant_dtype == tf.int16:
+                converter.optimizations = [tf.lite.Optimize.DEFAULT]
+                converter.target_spec.supported_ops = [
+                    tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+                ]
+            else:
+                # default to int8 quantization
+                converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
+
             input_arrays = converter.get_input_arrays()
             input_stats = {}
             # calculate the mean and quantization scale for every input tensor,
@@ -875,7 +903,7 @@ def test_forward_l2_pool2d():
 
 
 def _test_tflite2_quantized_convolution(
-    input_shape, kernel_shape, dilations, strides, padding, data_format
+    input_shape, kernel_shape, filters, padding="valid", data_format=None, int_quant_dtype=tf.int8
 ):
     """One iteration of TFLite2 quantized convolution with given shapes and attributes"""
     data_format = "channels_last" if "NHWC" else "channels_first"
@@ -884,23 +912,26 @@ def _test_tflite2_quantized_convolution(
 
     data_in = tf.keras.layers.Input(shape=data.shape[1:])
     conv = tf.keras.layers.Conv2D(
-        filters=kernel_shape[3],
+        filters=filters,
         kernel_size=(kernel_shape[0], kernel_shape[1]),
-        strides=strides,
+        activation=tf.nn.relu,
         padding=padding,
         data_format=data_format,
-        activation="relu",
-        use_bias=False,
     )(data_in)
     keras_model = tf.keras.models.Model(data_in, conv)
-    keras_model.layers[1].set_weights([kernel])
 
     # To create quantized values with dynamic range of activations, needs representative dataset
     def representative_data_gen():
         for i in range(1):
             yield [data]
 
-    tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen)
+    tflite_model_quant = _quantize_keras_model(
+        keras_model,
+        representative_data_gen,
+        is_float_input=True,
+        is_float_output=True,
+        int_quant_dtype=int_quant_dtype,
+    )
 
     tflite_output = run_tflite_graph(tflite_model_quant, data)
     tvm_output = run_tvm_graph(tflite_model_quant, data, data_in.name.replace(":0", ""))
@@ -909,6 +940,25 @@ def representative_data_gen():
     )
 
 
+def test_forward_quantized_convolution():
+    for int_quant_dtype in [tf.int8, tf.int16]:
+        _test_tflite2_quantized_convolution(
+            (1, 28, 28, 1),
+            (1, 1),
+            12,
+            data_format="NHWC",
+            int_quant_dtype=int_quant_dtype,
+        )
+
+        _test_tflite2_quantized_convolution(
+            (1, 1, 28, 28),
+            (1, 1),
+            12,
+            data_format="NCWH",
+            int_quant_dtype=int_quant_dtype,
+        )
+
+
 def _test_tflite2_quantized_depthwise_convolution(
     input_shape, kernel_shape, dilations, strides, padding, data_format, depth_multiplier
 ):
@@ -1046,7 +1096,6 @@ def _test_convolution(
                     quantized=quantized,
                     input_range=input_range,
                     experimental_new_converter=True,
-                    fp16_quantized=fp16_quantized,
                 )
         else:
             data_array = np.reshape(data_array, tensor_in_sizes).astype("float32")
@@ -1765,7 +1814,7 @@ def test_forward_concatenation():
 # --------------
 
 
-def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]):
+def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6], int_quant_dtype=tf.int8):
     """One iteration of unary elemwise"""
     if quantized:
         with tf.Graph().as_default():
@@ -1787,6 +1836,7 @@ def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]):
                 quantized=True,
                 input_range=input_range,
                 experimental_new_converter=True,
+                int_quant_dtype=int_quant_dtype,
             )
     else:
         with tf.Graph().as_default():
@@ -1795,14 +1845,20 @@ def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]):
             compare_tflite_with_tvm(data, ["in:0"], [in_data], [out])
 
 
-def _unary_elewise_create_model(math_op, data, offset=0):
+def _unary_elewise_create_model(math_op, data, offset=0, int_quant_dtype=tf.int8):
     class Model(tf.Module):
         @tf.function
         def tf_function(self, x):
             op = math_op(x)
             return op
 
-    dtype = "int8"
+    if int_quant_dtype in (tf.int8, tf.uint8):
+        dtype = "int8"
+    elif int_quant_dtype in (tf.int16, tf.uint16):
+        dtype = "int16"
+    else:
+        raise Exception(f"Unsupported dtype '{int_quant_dtype}' for unary elementwise test.")
+
     model = Model()
 
     # Save the model
@@ -1824,9 +1880,17 @@ def representative_dataset():
     converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
     converter.optimizations = [tf.lite.Optimize.DEFAULT]
     converter.representative_dataset = representative_dataset
-    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
-    converter.inference_input_type = tf.int8
-    converter.inference_output_type = tf.int8
+
+    if int_quant_dtype in (tf.int16, tf.uint16):
+        converter.target_spec.supported_ops = [
+            tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+        ]
+    else:
+        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+
+    converter.inference_input_type = int_quant_dtype
+    converter.inference_output_type = int_quant_dtype
+
     tflite_model = converter.convert()
     return tflite_model
 
@@ -1836,24 +1900,28 @@ def representative_dataset():
 # ----
 
 
-def _test_abs(data, quantized):
+def _test_abs(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of abs"""
     if quantized:
-        tflite_model_quant = _unary_elewise_create_model(tf.math.abs, data, offset=1)
+        tflite_model_quant = _unary_elewise_create_model(
+            tf.math.abs, data, offset=1, int_quant_dtype=int_quant_dtype
+        )
         tflite_output = run_tflite_graph(tflite_model_quant, data)
 
         # TFLite 2.6.x upgrade support
         if tf.__version__ < LooseVersion("2.6.1"):
             in_node = ["serving_default_input_int8"]
         else:
-            in_node = ["tfl.quantize"]
+            in_node = (
+                ["serving_default_input_int16"] if int_quant_dtype == tf.int16 else ["tfl.quantize"]
+            )
 
         tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
         tvm.testing.assert_allclose(
             np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2
         )
     else:
-        return _test_unary_elemwise(math_ops.abs, data, quantized)
+        return _test_unary_elemwise(math_ops.abs, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1861,14 +1929,18 @@ def _test_abs(data, quantized):
 # ----
 
 
-def _test_rsqrt(data, quantized):
+def _test_rsqrt(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of rsqrt"""
 
     # tensorflow version upgrade support
     if tf.__version__ < LooseVersion("2.6.1") or not quantized:
-        return _test_unary_elemwise(math_ops.rsqrt, data, quantized, quant_range=[1, 6])
+        return _test_unary_elemwise(
+            math_ops.rsqrt, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype
+        )
     else:
-        tflite_model_quant = _unary_elewise_create_model(tf.math.rsqrt, data)
+        tflite_model_quant = _unary_elewise_create_model(
+            tf.math.rsqrt, data, int_quant_dtype=int_quant_dtype
+        )
         tflite_output = run_tflite_graph(tflite_model_quant, data)
         in_node = ["tfl.quantize"]
 
@@ -1883,9 +1955,9 @@ def _test_rsqrt(data, quantized):
 # ----
 
 
-def _test_ceil(data, quantized):
+def _test_ceil(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of ceil"""
-    return _test_unary_elemwise(math_ops.ceil, data, quantized)
+    return _test_unary_elemwise(math_ops.ceil, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1893,9 +1965,9 @@ def _test_ceil(data, quantized):
 # -----
 
 
-def _test_floor(data, quantized):
+def _test_floor(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of floor"""
-    return _test_unary_elemwise(math_ops.floor, data, quantized)
+    return _test_unary_elemwise(math_ops.floor, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1903,9 +1975,9 @@ def _test_floor(data, quantized):
 # -----
 
 
-def _test_round(data, quantized):
+def _test_round(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of round"""
-    return _test_unary_elemwise(math_ops.round, data, quantized)
+    return _test_unary_elemwise(math_ops.round, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1913,9 +1985,9 @@ def _test_round(data, quantized):
 # ---
 
 
-def _test_exp(data, quantized):
+def _test_exp(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of exp"""
-    return _test_unary_elemwise(math_ops.exp, data, quantized)
+    return _test_unary_elemwise(math_ops.exp, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1923,9 +1995,11 @@ def _test_exp(data, quantized):
 # ---
 
 
-def _test_log(data, quantized):
+def _test_log(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of log"""
-    return _test_unary_elemwise(math_ops.log, data, quantized, quant_range=[1, 6])
+    return _test_unary_elemwise(
+        math_ops.log, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype
+    )
 
 
 #######################################################################
@@ -1933,9 +2007,9 @@ def _test_log(data, quantized):
 # ---
 
 
-def _test_sin(data, quantized):
+def _test_sin(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of sin"""
-    return _test_unary_elemwise(math_ops.sin, data, quantized)
+    return _test_unary_elemwise(math_ops.sin, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1943,10 +2017,12 @@ def _test_sin(data, quantized):
 # ---
 
 
-def _test_cos(data, quantized):
+def _test_cos(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of cos"""
     if quantized:
-        tflite_model_quant = _unary_elewise_create_model(tf.math.cos, data)
+        tflite_model_quant = _unary_elewise_create_model(
+            tf.math.cos, data, int_quant_dtype=int_quant_dtype
+        )
         tflite_output = run_tflite_graph(tflite_model_quant, data)
         in_node = ["tfl.quantize"]
         tvm_output = run_tvm_graph(tflite_model_quant, data, in_node)
@@ -1962,9 +2038,9 @@ def _test_cos(data, quantized):
 # ---
 
 
-def _test_tan(data, quantized):
+def _test_tan(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of tan"""
-    return _test_unary_elemwise(math_ops.tan, data, quantized)
+    return _test_unary_elemwise(math_ops.tan, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1972,9 +2048,9 @@ def _test_tan(data, quantized):
 # ------
 
 
-def _test_square(data, quantized):
+def _test_square(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of square"""
-    return _test_unary_elemwise(math_ops.square, data, quantized)
+    return _test_unary_elemwise(math_ops.square, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
@@ -1982,19 +2058,21 @@ def _test_square(data, quantized):
 # ------
 
 
-def _test_neg(data, quantized):
+def _test_neg(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of neg"""
-    return _test_unary_elemwise(math_ops.neg, data, quantized)
+    return _test_unary_elemwise(math_ops.neg, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
 #######################################################################
-# Neg
+# Sqrt
 # ------
 
 
-def _test_sqrt(data, quantized):
+def _test_sqrt(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of sqrt"""
-    return _test_unary_elemwise(math_ops.sqrt, data, quantized, quant_range=[1, 6])
+    return _test_unary_elemwise(
+        math_ops.sqrt, data, quantized, quant_range=[1, 6], int_quant_dtype=int_quant_dtype
+    )
 
 
 #######################################################################
@@ -2002,28 +2080,29 @@ def _test_sqrt(data, quantized):
 # ---
 
 
-def _test_elu(data, quantized):
+def _test_elu(data, quantized, int_quant_dtype=tf.int8):
     """One iteration of elu"""
-    return _test_unary_elemwise(nn_ops.elu, data, quantized)
+    return _test_unary_elemwise(nn_ops.elu, data, quantized, int_quant_dtype=int_quant_dtype)
 
 
-def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negtive=True):
+def _test_forward_unary_elemwise(test_op, int_quant_dtype=None, quantized=True, negative=True):
     # input data
     in_data, inq_data = [], []
 
+    np_dtype = int_quant_dtype.as_numpy_dtype if int_quant_dtype else np.uint8
+
     # quantized input data
     if quantized:
-        quant_dtype = quant_dtype or np.uint8
-        inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype))
-        inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype).reshape((2, 1, 3)))
-        if quant_dtype == np.int8:
+        inq_data.append(np.arange(1, 240, 40, dtype=np_dtype))
+        inq_data.append(np.arange(1, 240, 40, dtype=np_dtype).reshape((2, 1, 3)))
+        if int_quant_dtype == np.int8:
             inq_data.append(np.arange(-128, 127, 45, dtype=np.int8))
 
     for data in inq_data:
-        test_op(data, quantized=True)
+        test_op(data, quantized=True, int_quant_dtype=int_quant_dtype)
 
     # normal input data
-    if negtive:
+    if negative:
         in_data.append(np.arange(-2.0, 4.0, dtype=np.float32))
         in_data.append(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)))
     else:
@@ -2031,30 +2110,31 @@ def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negt
         in_data.append(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3)))
 
     for data in in_data:
-        test_op(data, quantized=False)
+        test_op(data, quantized=False, int_quant_dtype=int_quant_dtype)
 
 
 def test_all_unary_elemwise():
-    _test_forward_unary_elemwise(_test_abs, quant_dtype=np.int8)
+    _test_forward_unary_elemwise(_test_abs, int_quant_dtype=tf.int8)
+    _test_forward_unary_elemwise(_test_abs, int_quant_dtype=tf.int16)
     _test_forward_unary_elemwise(_test_floor)
     _test_forward_unary_elemwise(_test_exp)
-    _test_forward_unary_elemwise(_test_log, negtive=False)
+    _test_forward_unary_elemwise(_test_log, negative=False)
     _test_forward_unary_elemwise(_test_square)
     _test_forward_unary_elemwise(_test_sin)
     _test_forward_unary_elemwise(_test_neg)
-    _test_forward_unary_elemwise(_test_sqrt, negtive=False)
+    _test_forward_unary_elemwise(_test_sqrt, negative=False)
     # tensorflow version upgrade support
     if tf.__version__ < LooseVersion("2.6.1"):
-        _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.uint8)
+        _test_forward_unary_elemwise(_test_rsqrt, negative=False, int_quant_dtype=tf.uint8)
     else:
-        _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.int8)
+        _test_forward_unary_elemwise(_test_rsqrt, negative=False, int_quant_dtype=tf.int8)
     # ceil and cos come with TFLite 1.14.0.post1 fbs schema
     if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"):
         _test_forward_unary_elemwise(_test_ceil)
         if tf.__version__ < LooseVersion("2.6.1"):
             _test_forward_unary_elemwise(_test_cos, quantized=False)
         else:
-            _test_forward_unary_elemwise(_test_cos, quant_dtype=np.int8)
+            _test_forward_unary_elemwise(_test_cos, int_quant_dtype=tf.int8)
         _test_forward_unary_elemwise(_test_round)
         # This fails with TF and Tflite 1.15.2, this could not have been tested
         # in CI or anywhere else. The failure mode is that we see a backtrace
@@ -4572,6 +4652,47 @@ def test_forward_tflite_float16():
     tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
 
 
+def test_forward_mobilenet_int16():
+    """Test int16 quantized model"""
+    # MobilenetV1
+    model_file = tf_testing.get_workload_official(
+        "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz",
+        "mobilenet_v1_0.25_128_frozen.pb",
+    )
+
+    # Test image. Checking the labels because the requantize implementation is different between
+    # TFLite and Relay. This causes final output numbers to mismatch. So, testing accuracy via
+    # labels. Also, giving a real image, instead of random inputs.
+    #
+    # According to TFLite documentation, despite the quantization being done to make this model
+    # use int16 types, inputs and outputs are kept float32 by default.
+    # https://www.tensorflow.org/lite/performance/post_training_integer_quant_16x8
+    data = get_real_image(128, 128, quantized=False)
+
+    converter = tf.lite.TFLiteConverter.from_frozen_graph(
+        model_file, ["input"], ["MobilenetV1/Predictions/Reshape_1"]
+    )
+
+    def representative_dataset():
+        for _ in range(1):
+            yield [data]
+
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.target_spec.supported_ops = [
+        tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
+    ]
+    converter.representative_dataset = representative_dataset
+    tflite_model_buf = converter.convert()
+
+    tflite_output = run_tflite_graph(tflite_model_buf, data)
+    tflite_predictions = np.squeeze(tflite_output)
+    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]
+    tvm_output = run_tvm_graph(tflite_model_buf, data, "input")
+    tvm_predictions = np.squeeze(tvm_output)
+    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]
+    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
+
+
 #######################################################################
 # Quantized SSD Mobilenet
 # -----------------------
@@ -4867,3 +4988,5 @@ def test_prevent_tensorflow_dynamic_range():
     test_forward_tflite2_qnn_mobilenet_v2()
 
     test_forward_tflite_float16()
+
+    test_forward_mobilenet_int16()

From 2b1e5ce8dc2a23810f47b2b89e36a61c497f5c7f Mon Sep 17 00:00:00 2001
From: Elen Kalda <elen.kalda@arm.com>
Date: Wed, 18 May 2022 16:37:05 +0100
Subject: [PATCH 19/59] [microNPU] Fix bug in channels extraction in the
 matcher (#11335)

* [microNPU] Fix bug in channels extraction in the matcher

If the input tensor layout is NHCWB16, we were passing the W value
instead of the number of channels to get_valid_block_configs.

* Add test for conv2d
---
 .../backend/contrib/ethosu/te/convolution.py  |  4 +-
 .../backend/contrib/ethosu/te/depthwise.py    |  7 +-
 .../backend/contrib/ethosu/te/pooling.py      |  8 +-
 .../cascader/test_ethosu_conv2d_matcher.py    | 80 +++++++++++++++++++
 4 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
index e309ab5a2af4d..645a0d58221c2 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/convolution.py
@@ -287,7 +287,9 @@ def match_ethosu_conv2d(output_tensor, device_config):
     ifm_dtype = input_tensors[0].dtype
     ofm_dtype = output_tensor.dtype
 
-    ifm_channels = int(input_tensors[0].shape[3])
+    # Use channels from the weights tensor since its shape doesn't change during layout
+    # conversion
+    ifm_channels = int(input_tensors[1].shape[3])
     ofm_channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3])
     kernel_elements = kernel_height * kernel_width
 
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
index 03ce0e5349640..344cd64a323d7 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
@@ -279,8 +279,7 @@ def match_ethosu_depthwise_conv2d(output_tensor, device_config):
     ifm_dtype = input_tensors[0].dtype
     ofm_dtype = output_tensor.dtype
 
-    ifm_channels = int(input_tensors[0].shape[3])
-    ofm_channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3])
+    channels, kernel_height, kernel_width = (int(axis) for axis in input_tensors[1].shape[0:3])
 
     subkernels = len(
         device_config.get_kernel_steps(depthwise2d.op.name, kernel_height, kernel_width, ifm_dtype)
@@ -294,8 +293,8 @@ def match_ethosu_depthwise_conv2d(output_tensor, device_config):
         propagators[0],
         depthwise2d.op.attrs,
         output_tensor.shape,
-        ofm_channels,
-        ifm_channels,
+        channels,
+        channels,
         output_layout,
         input_layout,
         ifm_dtype,
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
index 8c20ea7165265..ca8c2ec9b3957 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/pooling.py
@@ -239,8 +239,8 @@ def match_ethosu_pooling(output_tensor, device_config):
     ifm_dtype = input_tensors[0].dtype
     ofm_dtype = output_tensor.dtype
 
-    ifm_channels = int(input_tensors[0].shape[3])
-    ofm_channels = ifm_channels
+    # Use channels from a stage of TE graph where the IFM is always NHWC
+    channels = int(pool2d.shape[3])
     pool_shape_h = int(pool2d.op.attrs["pool_shape_h"])
     pool_shape_w = int(pool2d.op.attrs["pool_shape_w"])
 
@@ -256,8 +256,8 @@ def match_ethosu_pooling(output_tensor, device_config):
         propagators[0],
         pool2d.op.attrs,
         output_tensor.shape,
-        ofm_channels,
-        ifm_channels,
+        channels,
+        channels,
         output_layout,
         input_layout,
         ifm_dtype,
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
index 17b41cbaf511e..76adb0b4cbd46 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py
@@ -98,5 +98,85 @@ def test_ethosu_conv2d_matcher(
     assert part.propagators[2].offset == scale_bias_offset
 
 
+@pytest.mark.parametrize(
+    "ifm_layout, ofm_layout, ifm_channels, expected_cycles",
+    [
+        ("NHWC", "NHWC", 24, 2304),
+        ("NHCWB16", "NHWC", 12, 2352),
+        ("NHWC", "NHCWB16", 38, 7056),
+        ("NHCWB16", "NHCWB16", 55, 4608),
+    ],
+)
+def test_ethosu_conv2d_block_config_from_matcher(
+    ifm_layout, ofm_layout, ifm_channels, expected_cycles
+):
+    ofm_channels = 10
+    ifm_height = 123
+    ifm_width = 155
+
+    ifm_shape = (
+        (1, ifm_height, ifm_width, ifm_channels)
+        if ifm_layout == "NHWC"
+        else (1, ifm_height, 1 + ((ifm_channels - 1) // 16), ifm_width, 16)
+    )
+    weight_shape = (ofm_channels, 3, 3, ifm_channels)
+    scale_bias_shape = (ofm_channels, 10)
+
+    ifm = te.placeholder(ifm_shape, dtype="int8")
+    weight = te.placeholder(weight_shape, dtype="int8")
+    scale_bias = te.placeholder(scale_bias_shape, dtype="uint8")
+    lut = te.placeholder((), dtype="uint8")
+    out = conv2d_compute(
+        ifm=ifm,
+        weight=weight,
+        scale_bias=scale_bias,
+        lut=lut,
+        ifm_scale=1,
+        ifm_zero_point=0,
+        ofm_scale=1,
+        ofm_zero_point=0,
+        weight_zero_point=0,
+        strides=(1, 1),
+        padding=(0, 0, 0, 0),
+        dilation=(1, 1),
+        activation="NONE",
+        clip_min=0,
+        clip_max=0,
+        upscale="NONE",
+        rounding_mode="TFL",
+        ifm_layout=ifm_layout,
+        ofm_layout=ofm_layout,
+    )
+
+    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
+    part = match_ethosu_conv2d(out, device_config)
+
+    ofm_shape = [int(i) for i in part.subgraph.output_tensor.shape]
+
+    # Add inputs and outputs to the part
+    input_tensor = cs.Tensor(ifm_shape, "int8")
+    part.set_input(0, input_tensor)
+    weight_tensor = cs.Tensor(weight_shape, "int8")
+    part.set_input(1, weight_tensor)
+    scale_bias_tensor = cs.Tensor(scale_bias_shape, "int8")
+    part.set_input(2, scale_bias_tensor)
+    output_tensor = cs.Tensor(ofm_shape, "int8")
+    part.set_output(output_tensor)
+
+    # Create a stripe of a size of the output tensor
+    order = [1, 2, 3, 4] if ofm_layout == "NHWC" else [1, 2, 4, 3, 0]
+    stripes = [1] * len(order)
+    offset = [0] * len(order)
+
+    stripe_config = cs.StripeConfig(ofm_shape, ofm_shape, ofm_shape, order, stripes, offset)
+
+    block = part.get_block_config(stripe_config)
+
+    # Since we don't know the values of the variables we passed to the get_valid_block_configs in
+    # the matcher, best we can do is to verify the compute cycle count since the channels have a
+    # significant effect on it
+    assert block.compute_cycles == expected_cycles
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From fe1090e8aa6b6307f150f46ab968451765a6a079 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 18 May 2022 11:38:55 -0500
Subject: [PATCH 20/59] [TIR] IndexMap Simplification Constraints (#11342)

* [TIR] Added optional arith::Analyzer argument to IndexMap methods

Simplifications done when applying a transformation may require
iteration bounds from the caller scope.  This is a C++ only feature,
because `arith::Analyzer` doesn't inherit from `ObjectRef`, and cannot
be passed through the FFI.

* [TIR] Pass analyzer from TransformLayoutRewriter to IndexMap

Avoid needing to simplify twice, now that IndexMap can accept the
analyzer from the calling scope.

* [TIR] Added BlockNode handling to IRMutatorWithAnalyzer

Iteration variables defined in `BlockNode::iter_vars` may be useful
for simplifications.  This functionality was extracted from
`TransformLayoutRewriter`.
---
 include/tvm/tir/index_map.h                   | 22 ++++++++--
 src/arith/ir_mutator_with_analyzer.cc         |  7 ++++
 src/arith/ir_mutator_with_analyzer.h          |  1 +
 src/tir/ir/index_map.cc                       | 42 ++++++++++++-------
 .../primitive/layout_transformation.cc        | 28 +++++++------
 5 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h
index b6faa67ab53af..315bda259993d 100644
--- a/include/tvm/tir/index_map.h
+++ b/include/tvm/tir/index_map.h
@@ -33,6 +33,12 @@
 
 #include <utility>
 
+namespace tvm {
+namespace arith {
+class Analyzer;
+}
+}  // namespace tvm
+
 namespace tvm {
 namespace tir {
 
@@ -78,10 +84,14 @@ class IndexMapNode : public Object {
    * \param indices The indices in the input space.  Should contain
    * one value for each variable in `initial_indices`.
    *
+   * \param analyzer An optional analyzer to be used to simplify the
+   * resulting expressions.  If null, will use a fresh analyzer.
+   *
    * \returns The indices in the output space.  Contains one value for
    * each expression in `final_indices`.
    */
-  Array<PrimExpr> MapIndices(const Array<PrimExpr>& indices) const;
+  Array<PrimExpr> MapIndices(const Array<PrimExpr>& indices,
+                             arith::Analyzer* analyzer = nullptr) const;
 
   /*! \brief Map a memory range to the output space
    *
@@ -93,20 +103,26 @@ class IndexMapNode : public Object {
    * \param ranges The ranges in the input space.  Should contain one
    * value for each variable in `initial_indices`.
    *
+   * \param analyzer An optional analyzer to be used to simplify the
+   * resulting expressions.  If null, will use a fresh analyzer.
+   *
    * \returns The ranges in the output space.  Contains one value for
    * each expression in `final_indices`.
    */
-  Array<Range> MapRanges(const Array<Range>& ranges) const;
+  Array<Range> MapRanges(const Array<Range>& ranges, arith::Analyzer* analyzer = nullptr) const;
 
   /*! \brief Map a buffer shape to the output space
    *
    * \param shape The buffer shape in the input space.  Should contain
    * one value for each variable in `initial_indices`.
    *
+   * \param analyzer An optional analyzer to be used to simplify the
+   * resulting expressions.  If null, will use a fresh analyzer.
+   *
    * \returns The buffer shape in the output space.  Contains one
    * value for each expression in `final_indices`.
    */
-  Array<PrimExpr> MapShape(const Array<PrimExpr>& shape) const;
+  Array<PrimExpr> MapShape(const Array<PrimExpr>& shape, arith::Analyzer* analyzer = nullptr) const;
 
   /*!
    * \brief Convert to string representation in Python.
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc
index 7bc0d946ade74..9cae3b7a6ac8b 100644
--- a/src/arith/ir_mutator_with_analyzer.cc
+++ b/src/arith/ir_mutator_with_analyzer.cc
@@ -35,6 +35,13 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const ForNode* op) {
   return StmtExprMutator::VisitStmt_(op);
 }
 
+Stmt IRMutatorWithAnalyzer::VisitStmt_(const BlockNode* op) {
+  for (const auto& iter_var : op->iter_vars) {
+    analyzer_->Bind(iter_var->var, iter_var->dom);
+  }
+  return StmtExprMutator::VisitStmt_(op);
+}
+
 Stmt IRMutatorWithAnalyzer::VisitStmt_(const LetStmtNode* op) {
   PrimExpr value = this->VisitExpr(op->value);
   if (SideEffect(value) <= CallEffectKind::kPure) {
diff --git a/src/arith/ir_mutator_with_analyzer.h b/src/arith/ir_mutator_with_analyzer.h
index 004265bbe50a6..3bd3a98a84457 100644
--- a/src/arith/ir_mutator_with_analyzer.h
+++ b/src/arith/ir_mutator_with_analyzer.h
@@ -50,6 +50,7 @@ class IRMutatorWithAnalyzer : public tir::StmtExprMutator {
 
   // override functions that need to populate the context information.
   tir::Stmt VisitStmt_(const tir::ForNode* op) override;
+  tir::Stmt VisitStmt_(const tir::BlockNode* op) override;
   tir::Stmt VisitStmt_(const tir::LetStmtNode* op) override;
   tir::Stmt VisitStmt_(const tir::IfThenElseNode* op) override;
   tir::Stmt VisitStmt_(const tir::AttrStmtNode* op) override;
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc
index 4c0a7d3508c1d..77678d829a8e2 100644
--- a/src/tir/ir/index_map.cc
+++ b/src/tir/ir/index_map.cc
@@ -159,24 +159,29 @@ IndexMap IndexMap::Inverse(Array<Range> initial_ranges) const {
   return IndexMap(output_vars, inverse_exprs);
 }
 
-Array<PrimExpr> IndexMapNode::MapIndices(const Array<PrimExpr>& indices) const {
+Array<PrimExpr> IndexMapNode::MapIndices(const Array<PrimExpr>& indices,
+                                         arith::Analyzer* analyzer) const {
   ICHECK_EQ(indices.size(), initial_indices.size());
 
-  arith::Analyzer analyzer;
+  Map<Var, PrimExpr> vmap;
 
   for (size_t i = 0; i < initial_indices.size(); i++) {
-    analyzer.Bind(initial_indices[i], indices[i]);
+    vmap.Set(initial_indices[i], indices[i]);
   }
 
-  Array<PrimExpr> output;
-  for (const auto& output_dim : final_indices) {
-    output.push_back(analyzer.Simplify(output_dim));
+  arith::Analyzer local_analyzer;
+  if (!analyzer) {
+    analyzer = &local_analyzer;
   }
 
+  Array<PrimExpr> output = final_indices;
+  output.MutateByApply(
+      [&](const PrimExpr& index) { return analyzer->Simplify(Substitute(index, vmap)); });
+
   return output;
 }
 
-Array<Range> IndexMapNode::MapRanges(const Array<Range>& ranges) const {
+Array<Range> IndexMapNode::MapRanges(const Array<Range>& ranges, arith::Analyzer* analyzer) const {
   ICHECK_EQ(ranges.size(), initial_indices.size());
 
   Map<Var, Range> input_iters;
@@ -189,25 +194,30 @@ Array<Range> IndexMapNode::MapRanges(const Array<Range>& ranges) const {
     dom_map[initial_indices[i].get()] = arith::IntSet::FromRange(ranges[i]);
   }
 
+  arith::Analyzer local_analyzer;
+  if (!analyzer) {
+    analyzer = &local_analyzer;
+  }
+
   Array<Range> output;
-  arith::Analyzer analyzer;
   for (const auto& final_index : final_indices) {
     auto int_set = arith::EvalSet(final_index, dom_map);
-    output.push_back(Range::FromMinExtent(analyzer.Simplify(int_set.min()),
-                                          analyzer.Simplify(int_set.max() - int_set.min() + 1)));
+    output.push_back(Range::FromMinExtent(analyzer->Simplify(int_set.min()),
+                                          analyzer->Simplify(int_set.max() - int_set.min() + 1)));
   }
 
   return output;
 }
 
-Array<PrimExpr> IndexMapNode::MapShape(const Array<PrimExpr>& shape) const {
+Array<PrimExpr> IndexMapNode::MapShape(const Array<PrimExpr>& shape,
+                                       arith::Analyzer* analyzer) const {
   ICHECK_EQ(shape.size(), initial_indices.size());
 
   Array<Range> ranges;
   for (auto& dim : shape) {
     ranges.push_back(Range(0, dim));
   }
-  Array<Range> mapped = MapRanges(std::move(ranges));
+  Array<Range> mapped = MapRanges(std::move(ranges), analyzer);
 
   Array<PrimExpr> output;
   for (auto& range : mapped) {
@@ -265,8 +275,12 @@ TVM_REGISTER_GLOBAL("tir.IndexMap")
       return IndexMap(initial_indices, final_indices);
     });
 
-TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices").set_body_method<IndexMap>(&IndexMapNode::MapIndices);
-TVM_REGISTER_GLOBAL("tir.IndexMapMapShape").set_body_method<IndexMap>(&IndexMapNode::MapShape);
+TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices")
+    .set_body_typed([](IndexMap map, Array<PrimExpr> indices) { return map->MapIndices(indices); });
+
+TVM_REGISTER_GLOBAL("tir.IndexMapMapShape").set_body_typed([](IndexMap map, Array<PrimExpr> shape) {
+  return map->MapShape(shape);
+});
 TVM_REGISTER_GLOBAL("tir.IndexMapInverse").set_body_method(&IndexMap::Inverse);
 
 TVM_REGISTER_GLOBAL("tir.IndexMapNonSurjectiveInverse")
diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc
index 87e09505f5024..fb63b1b289b12 100644
--- a/src/tir/schedule/primitive/layout_transformation.cc
+++ b/src/tir/schedule/primitive/layout_transformation.cc
@@ -16,12 +16,13 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include "../../../arith/ir_mutator_with_analyzer.h"
 #include "../utils.h"
 
 namespace tvm {
 namespace tir {
 
-class TransformLayoutRewriter : private StmtExprMutator {
+class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer {
  public:
   /*!
    * \brief Rewrite the access to the buffer after the transformation
@@ -36,27 +37,32 @@ class TransformLayoutRewriter : private StmtExprMutator {
                                                     const Buffer& old_buffer,
                                                     const Buffer& new_buffer,
                                                     const IndexMap& index_map) {
-    TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map);
+    arith::Analyzer analyzer;
+    TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, &analyzer);
     Stmt result = rewriter(scope_stmt);
     return {result, rewriter.block_sref_reuse_};
   }
 
  private:
   TransformLayoutRewriter(const Buffer& old_buffer, const Buffer& new_buffer,
-                          const IndexMap& index_map)
-      : old_buffer_(old_buffer),
+                          const IndexMap& index_map, arith::Analyzer* analyzer)
+      : IRMutatorWithAnalyzer(analyzer),
+        old_buffer_(old_buffer),
         new_buffer_(new_buffer),
         index_map_(index_map),
         buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {}
 
   void RewriteBufferAccess(Buffer* buffer, Array<PrimExpr>* indices) {
     *buffer = new_buffer_;
-    *indices = index_map_->MapIndices(*indices);
-    (*indices).MutateByApply([this](const PrimExpr& index) { return analyzer_.Simplify(index); });
+    *indices = index_map_->MapIndices(*indices, analyzer_);
   }
 
+  using Parent = arith::IRMutatorWithAnalyzer;
+  using Parent::VisitExpr_;
+  using Parent::VisitStmt_;
+
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    BufferLoad buffer_load = Downcast<BufferLoad>(StmtExprMutator::VisitExpr_(op));
+    BufferLoad buffer_load = Downcast<BufferLoad>(Parent::VisitExpr_(op));
     if (buffer_load->buffer.same_as(old_buffer_)) {
       auto* n = buffer_load.CopyOnWrite();
       RewriteBufferAccess(&n->buffer, &n->indices);
@@ -65,7 +71,7 @@ class TransformLayoutRewriter : private StmtExprMutator {
   }
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
-    BufferStore buffer_store = Downcast<BufferStore>(StmtExprMutator::VisitStmt_(op));
+    BufferStore buffer_store = Downcast<BufferStore>(Parent::VisitStmt_(op));
     if (buffer_store->buffer.same_as(old_buffer_)) {
       auto* n = buffer_store.CopyOnWrite();
       RewriteBufferAccess(&n->buffer, &n->indices);
@@ -86,10 +92,7 @@ class TransformLayoutRewriter : private StmtExprMutator {
   }
 
   Stmt VisitStmt_(const BlockNode* op) final {
-    for (const auto& iter_var : op->iter_vars) {
-      analyzer_.Bind(iter_var->var, iter_var->dom);
-    }
-    Block block = Downcast<Block>(StmtExprMutator::VisitStmt_(op));
+    Block block = Downcast<Block>(Parent::VisitStmt_(op));
     auto infered_access_regions = GetBlockReadWriteRegion(block, buffer_data_to_buffer_);
     auto* n = block.CopyOnWrite();
     RewriteAccessRegion(&n->reads, infered_access_regions[0]);
@@ -101,7 +104,6 @@ class TransformLayoutRewriter : private StmtExprMutator {
   const Buffer& old_buffer_;
   const Buffer& new_buffer_;
   const IndexMap& index_map_;
-  arith::Analyzer analyzer_;
   Map<Var, Buffer> buffer_data_to_buffer_;
   Map<Block, Block> block_sref_reuse_;
 };

From 95509eed2650d58463c7b1d89c969bd17770864f Mon Sep 17 00:00:00 2001
From: ah cheng <darkvan_wen@hotmail.com>
Date: Thu, 19 May 2022 01:10:00 +0800
Subject: [PATCH 21/59] fix matmul broadcast (#11242)

---
 python/tvm/relay/frontend/onnx.py          | 38 +++++++++++++++-------
 tests/python/frontend/onnx/test_forward.py |  1 +
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index e68daca4c4f0f..1294852ba1971 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -259,23 +259,39 @@ def flatten_to_nd(x, x_shape, nd=3):
             return out
 
         # Determine the output batch dimension.
+        new_a_shape = a_shape
+        new_b_shape = b_shape
         if a_rank > b_rank:
-            out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2])
+            rank_diff = a_rank - b_rank
+            new_b_shape = _op.concatenate(
+                [
+                    _expr.const([1] * rank_diff, dtype=infer_type(b_shape).checked_type.dtype),
+                    b_shape,
+                ],
+                0,
+            )
         elif a_rank < b_rank:
-            out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2])
-        # If its unclear how broadcasting should be applied, the output
-        # shape is determined by choosing the maximum value from each input.
-        else:
-            out_batch = _op.concatenate(
+            rank_diff = b_rank - a_rank
+            new_a_shape = _op.concatenate(
                 [
-                    _op.maximum(
-                        _op.strided_slice(a_shape, [i], [i + 1]),
-                        _op.strided_slice(b_shape, [i], [i + 1]),
-                    )
-                    for i in range(a_rank - 2)
+                    _expr.const([1] * rank_diff, dtype=infer_type(a_shape).checked_type.dtype),
+                    a_shape,
                 ],
                 0,
             )
+        else:
+            pass
+
+        out_batch = _op.concatenate(
+            [
+                _op.maximum(
+                    _op.strided_slice(new_b_shape, [i], [i + 1]),
+                    _op.strided_slice(new_a_shape, [i], [i + 1]),
+                )
+                for i in range(max(a_rank, b_rank) - 2)
+            ],
+            0,
+        )
 
         b_type = infer_type(inputs[1])
         # Convert to dense if the second matrix is 2d and non-dynamic
diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 643dfe820b916..6fac7f2f20aa6 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -1286,6 +1286,7 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, convert_config=None):
     verify_batch_matmul((4, 32, 16), (16, 32), (4, 32, 32))
     verify_batch_matmul((4, 32, 16, 32), (32, 16), (4, 32, 16, 16))
     verify_batch_matmul((4, 32, 16, 32), (1, 32, 32, 16), (4, 32, 16, 16))
+    verify_batch_matmul((4, 1, 16, 32), (1, 32, 32, 16), (4, 32, 16, 16))
     # Test transb=False
     verify_batch_matmul(
         (2, 3, 4, 3),

From f34bd22ddc4e7064eabe9fac42c4c04f54ede399 Mon Sep 17 00:00:00 2001
From: A1245967 <a1245967@gmail.com>
Date: Thu, 19 May 2022 02:58:35 +0800
Subject: [PATCH 22/59] Fix function number datatype from char to uint16_t
 (#10014)

rewrite the modified part to pass lint check

Use 2 bytes for func num in fun_registry

Fix errors in linter

Add the declaration of the helper functions

set 2 bytes for func num in func_registry test units

pass num_func by value

This commit changes the datatype of the number of functions from 1 byte to 2 bytes.
It also adds helper functions to access the number of functions and the first function name.
---
 include/tvm/runtime/crt/func_registry.h | 27 ++++++++++++++++-
 src/runtime/crt/common/func_registry.c  | 39 ++++++++++++++++++-------
 src/target/func_registry_generator.cc   |  8 ++++-
 tests/crt/func_registry_test.cc         |  7 +++--
 4 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h
index 4f8a19af591e8..50737f8717987 100644
--- a/include/tvm/runtime/crt/func_registry.h
+++ b/include/tvm/runtime/crt/func_registry.h
@@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry {
   /*! \brief Names of registered functions, concatenated together and separated by \0.
    * An additional \0 is present at the end of the concatenated blob to mark the end.
    *
-   * Byte 0 is the number of functions in `funcs`.
+   * Bytes 0 and 1 hold the number of functions in `funcs`.
    */
   const char* names;
 
@@ -50,6 +50,31 @@ typedef struct TVMFuncRegistry {
   const TVMBackendPackedCFunc* funcs;
 } TVMFuncRegistry;
 
+/*!
+ * \brief Get the number of functions from the registry.
+ *
+ * \param reg TVMFunctionRegistry instance that contains the function.
+ * \return The number of functions from registry.
+ */
+uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg);
+
+/*!
+ * \brief Set the number of functions to registry.
+ *
+ * \param reg TVMFunctionRegistry instance that contains the function.
+ * \param num_funcs The number of functions
+ * \return 0 when successful.
+ */
+int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs);
+
+/*!
+ * \brief Get the address of 0th function from registry.
+ *
+ * \param reg TVMFunctionRegistry instance that contains the function.
+ * \return the address of 0th function from registry
+ */
+const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg);
+
 /*!
  * \brief Get packed function from registry by name.
  *
diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c
index 116a5c496f1bd..49cef8fd70eb3 100644
--- a/src/runtime/crt/common/func_registry.c
+++ b/src/runtime/crt/common/func_registry.c
@@ -60,14 +60,29 @@ int strcmp_cursor(const char** cursor, const char* name) {
   return return_value;
 }
 
+uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) {
+  uint16_t num_funcs;
+  memcpy(&num_funcs, reg->names, sizeof(num_funcs));
+  return num_funcs;
+}
+
+int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) {
+  memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs));
+  return 0;
+}
+
+const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) {
+  // NOTE: first function name starts at index 2 to skip num_funcs.
+  return (reg->names + sizeof(uint16_t));
+}
+
 tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name,
                                        tvm_function_index_t* function_index) {
   tvm_function_index_t idx;
-  const char* reg_name_ptr;
+  const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg);
 
   idx = 0;
-  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
-  for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) {
+  for (; *reg_name_ptr != '\0'; reg_name_ptr++) {
     if (!strcmp_cursor(&reg_name_ptr, name)) {
       *function_index = idx;
       return kTvmErrorNoError;
@@ -82,9 +97,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n
 tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg,
                                            tvm_function_index_t function_index,
                                            TVMBackendPackedCFunc* out_func) {
-  uint8_t num_funcs;
+  uint16_t num_funcs;
 
-  num_funcs = reg->names[0];
+  num_funcs = TVMFuncRegistry_GetNumFuncs(reg);
   if (function_index >= num_funcs) {
     return kTvmErrorFunctionIndexInvalid;
   }
@@ -101,7 +116,8 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 
   reg->registry.names = (const char*)buffer;
   buffer[0] = 0;  // number of functions present in buffer.
-  buffer[1] = 0;  // end of names list marker.
+  buffer[1] = 0;  // note that we combine the first two elements to form a 16-bit function count.
+  buffer[2] = 0;  // end of names list marker.
 
   // compute a guess of the average size of one entry:
   //  - assume average function name is around ~10 bytes
@@ -117,13 +133,12 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name,
                                            TVMBackendPackedCFunc func, int override) {
   size_t idx;
-  char* reg_name_ptr;
+  char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry));
 
   idx = 0;
   // NOTE: safe to discard const qualifier here, since reg->registry.names was set from
   // TVMMutableFuncRegistry_Create above.
-  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
-  for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) {
+  for (; *reg_name_ptr != 0; reg_name_ptr++) {
     if (!strcmp_cursor((const char**)&reg_name_ptr, name)) {
       if (override == 0) {
         return kTvmErrorFunctionAlreadyDefined;
@@ -149,7 +164,11 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch
   reg_name_ptr += name_len + 1;
   *reg_name_ptr = 0;
   ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func;
-  ((char*)reg->registry.names)[0]++;  // increment num_funcs.
+
+  uint16_t num_funcs;
+  // increment num_funcs.
+  num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1;
+  TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs);
 
   return kTvmErrorNoError;
 }
diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc
index 7c948d50cbb94..d679bf379b628 100644
--- a/src/target/func_registry_generator.cc
+++ b/src/target/func_registry_generator.cc
@@ -31,7 +31,13 @@ namespace target {
 
 std::string GenerateFuncRegistryNames(const Array<String>& function_names) {
   std::stringstream ss;
-  ss << (unsigned char)(function_names.size());
+
+  unsigned char function_nums[sizeof(uint16_t)];
+  *reinterpret_cast<uint16_t*>(function_nums) = function_names.size();
+  for (auto f : function_nums) {
+    ss << f;
+  }
+
   for (auto f : function_names) {
     ss << f << '\0';
   }
diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc
index 9f0e7f8d1a5aa..5962a3acee39f 100644
--- a/tests/crt/func_registry_test.cc
+++ b/tests/crt/func_registry_test.cc
@@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) {
 }
 
 TEST(FuncRegistry, Empty) {
-  TVMFuncRegistry registry{"\000", NULL};
+  TVMFuncRegistry registry{"\000\000", NULL};
 
   EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(&registry, "foo", NULL));
   EXPECT_EQ(kTvmErrorFunctionIndexInvalid,
@@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_
 }
 
 // Matches the style of registry defined in generated C modules.
-const char* kBasicFuncNames = "\002Foo\0Bar\0";  // NOTE: final \0
+const char* kBasicFuncNames = "\002\000Foo\0Bar\0";  // NOTE: final \0
 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar};
 const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs};
 
@@ -111,7 +111,8 @@ TEST(FuncRegistry, ConstGlobalRegistry) {
 
   // Foo
   EXPECT_EQ(kBasicFuncNames[0], 2);
-  EXPECT_EQ(kBasicFuncNames[1], 'F');
+  EXPECT_EQ(kBasicFuncNames[1], 0);
+  EXPECT_EQ(kBasicFuncNames[2], 'F');
   EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index));
   EXPECT_EQ(0, func_index);
 

From dd3262fa0438182944794f87ee7dbe8768c89269 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 18 May 2022 12:17:46 -0700
Subject: [PATCH 23/59] [ci][docker] Conditionally link sccache to clang
 (#11316)

This was causing errors with #11314 since it was making it appear as if `clang` was available when it was only the sccache wrapper.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docker/install/ubuntu_install_sccache.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/docker/install/ubuntu_install_sccache.sh b/docker/install/ubuntu_install_sccache.sh
index dff7d977860b4..5ef78643a741f 100755
--- a/docker/install/ubuntu_install_sccache.sh
+++ b/docker/install/ubuntu_install_sccache.sh
@@ -26,8 +26,14 @@ cargo install sccache
 mkdir /opt/sccache
 ln "$(which sccache)" /opt/sccache/cc
 ln "$(which sccache)" /opt/sccache/c++
-ln "$(which sccache)" /opt/sccache/clang
-ln "$(which sccache)" /opt/sccache/clang++
+
+# Only add clang if it's on the PATH
+if command -v clang &> /dev/null
+then
+    ln "$(which sccache)" /opt/sccache/clang
+    ln "$(which sccache)" /opt/sccache/clang++
+fi
+
 
 # make rust usable by all users after install during container build
 chmod -R a+rw /opt/rust

From 3fbd9b66b745eb59021c265a9708b6ac08f700d0 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Wed, 18 May 2022 14:19:43 -0500
Subject: [PATCH 24/59] [CI] Added message if test is running on another shard
 (#11331)

---
 conftest.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/conftest.py b/conftest.py
index 9768b6cc528dd..3c04f0680a113 100644
--- a/conftest.py
+++ b/conftest.py
@@ -58,9 +58,9 @@
 }
 
 
-def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
+def find_shard_index(nodeid: str, num_shards: int) -> int:
     """
-    Return true if this test should run on this shard
+    Return the index of the shard that should run this test
     """
     for prefix, target_shard_idx in FIXED_ALLOCATION_PREFIXES.items():
         if nodeid.startswith(prefix):
@@ -68,7 +68,7 @@ def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
                 raise RuntimeError(
                     f"Cannot collect sharded tests, {nodeid} has hardcoded shard index {target_shard_idx} among only {num_shards} shards"
                 )
-            return target_shard_idx == shard_index
+            return target_shard_idx
 
     if nodeid in HARDCODED_ALLOCATIONS:
         hash = HARDCODED_ALLOCATIONS[nodeid]
@@ -76,7 +76,7 @@ def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
         hash = hashlib.md5(nodeid.encode())
         hash = int(hash.hexdigest(), 16)
 
-    return hash % num_shards == shard_index
+    return hash % num_shards
 
 
 def pytest_collection_modifyitems(config, items):
@@ -89,5 +89,10 @@ def pytest_collection_modifyitems(config, items):
 
     print(f"Marking tests for shard {shard_index} of {num_shards}")
     for item in items:
-        if not should_run(item.nodeid, num_shards=num_shards, shard_index=shard_index):
-            item.add_marker(pytest.mark.skip())
+        item_shard_index = find_shard_index(item.nodeid, num_shards=num_shards)
+        item.add_marker(
+            pytest.mark.skipif(
+                item_shard_index != shard_index,
+                reason=f"Test running on shard {item_shard_index} of {num_shards}",
+            )
+        )

From fb0938a5410ad91594cb4b56fdb5a84845197cb0 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Thu, 19 May 2022 03:23:42 +0800
Subject: [PATCH 25/59] [CI] update oneDNN to v2.6 (#11140)

* enable CI to get and build latest oneDNN release

* remove the source code after installed

* fix wget error and improve naming

* refine the cmake/make commands

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>

* pinned to v2.6 by default

* simplify the logic and install to /usr/lib

Co-authored-by: driazati <9407960+driazati@users.noreply.github.com>
---
 docker/install/ubuntu_install_dnnl.sh | 33 +++++++++++++++++++++------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/docker/install/ubuntu_install_dnnl.sh b/docker/install/ubuntu_install_dnnl.sh
index 34f917ce6606a..3654d140f55b4 100755
--- a/docker/install/ubuntu_install_dnnl.sh
+++ b/docker/install/ubuntu_install_dnnl.sh
@@ -20,10 +20,29 @@ set -e
 set -u
 set -o pipefail
 
-cd /usr/local/
-wget -q https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_lnx_2.2.0_cpu_gomp.tgz
-tar -xzf dnnl_lnx_2.2.0_cpu_gomp.tgz
-mv dnnl_lnx_2.2.0_cpu_gomp/include/* /usr/local/include/
-mv dnnl_lnx_2.2.0_cpu_gomp/lib/libdnnl* /usr/local/lib/
-rm -rf dnnl_lnx_2.2.0_cpu_gomp.tgz dnnl_lnx_2.2.0_cpu_gomp
-cd -
+pre_dir=`pwd`
+tmpdir=$(mktemp -d)
+
+rls_tag="v2.6"
+
+dnnl_ver=`echo ${rls_tag} | sed 's/v//g'`
+echo "Using oneDNN release version ${dnnl_ver} with tag '${rls_tag}'"
+
+archive_name="${rls_tag}.tar.gz"
+archive_url="https://github.com/oneapi-src/oneDNN/archive/refs/tags/${archive_name}"
+archive_folder="${tmpdir}/oneDNN-${dnnl_ver}"
+archive_hash="4cb7b80bfe16920bc096e18e7d8caa56b9ab7a4dab2a091a230bcf562c09533392f4a4ccd4db22754a10293670efdea20382db0994dc47949005a4c77f14b64c"
+
+cd "${tmpdir}"
+
+curl -sL "${archive_url}" -o "${archive_name}"
+echo "$archive_hash" ${archive_name} | sha512sum -c
+tar xf "${archive_name}"
+
+cd "${archive_folder}"
+cmake . -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_LIBDIR=lib
+make -j"$(nproc)"
+make install
+
+cd ${pre_dir}
+rm -rf "${tmpdir}"

From 89a439ed4c8c392f0f144bef325aed64889e91a4 Mon Sep 17 00:00:00 2001
From: Adam Straw <astraw@octoml.ai>
Date: Wed, 18 May 2022 13:05:23 -0700
Subject: [PATCH 26/59] [Hexagon] Add unit tests for Hexagon Device API
 (#11319)

* [Hexagon] Add unit tests for Hexagon Device API
* add scalar alloc for Hexagon + cleanup
---
 docker/Dockerfile.ci_hexagon                  |   1 -
 src/runtime/hexagon/hexagon_device_api.cc     |  34 ++--
 src/runtime/hexagon/hexagon_device_api.h      |  10 ++
 .../hexagon/hexagon_device_api_tests.cc       | 148 ++++++++++++++++++
 .../test_hexagon/test_run_unit_tests.py       |   6 +-
 tests/scripts/task_build_hexagon_api.sh       |   5 +-
 tests/scripts/task_python_hexagon.sh          |   3 -
 7 files changed, 177 insertions(+), 30 deletions(-)
 create mode 100644 tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc

diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon
index 20b185ab64560..ddca5c6c2e666 100644
--- a/docker/Dockerfile.ci_hexagon
+++ b/docker/Dockerfile.ci_hexagon
@@ -63,7 +63,6 @@ ENV CLANG_LLVM_HOME /opt/clang-llvm
 ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/opt/clang-llvm/lib
 ENV PATH /opt/clang-llvm/bin:$PATH
 ENV HEXAGON_TOOLCHAIN "${HEXAGON_SDK_ROOT}/tools/HEXAGON_Tools/8.5.08/Tools"
-ENV HEXAGON_GTEST "${HEXAGON_SDK_ROOT}/utils/googletest/gtest"
 
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index db3ef3faa4f76..c9c1586008e3a 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -55,10 +55,15 @@ void HexagonDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
 // DataSpace: static allocations for Hexagon
 void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                        Optional<String> mem_scope) {
+  CHECK(shape) << "shape array is null";
+  CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
+
   if (!mem_scope.defined() || mem_scope.value() == "global") {
     return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
   }
 
+  // must be Hexagon device and VTCM scope after this point
+  CHECK_EQ(mem_scope.value(), "global.vtcm");
   CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
 
   size_t typesize = (dtype.bits / 8) * dtype.lanes;
@@ -68,7 +73,9 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
     alignment = kHexagonAllocAlignment;
   }
 
-  if (ndim == 1) {
+  if (ndim == 0) {
+    return AllocateHexagonBuffer(typesize, alignment, mem_scope);
+  } else if (ndim == 1) {
     size_t nbytes = shape[0] * typesize;
     return AllocateHexagonBuffer(nbytes, alignment, mem_scope);
   } else if (ndim == 2) {
@@ -84,10 +91,9 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap
 
 void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
                                        DLDataType type_hint) {
-  // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU;
-  bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
-                         (DLDeviceType(dev.device_type) == kDLCPU);
-  CHECK(is_valid_device) << "dev.device_type: " << dev.device_type;
+  CHECK(nbytes) << "number of bytes is zero";
+  CHECK(alignment) << "alignment is zero";
+  CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
   if (alignment < kHexagonAllocAlignment) {
     alignment = kHexagonAllocAlignment;
   }
@@ -95,10 +101,8 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme
 }
 
 void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
-  // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU;
-  bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
-                         (DLDeviceType(dev.device_type) == kDLCPU);
-  CHECK(is_valid_device) << "dev.device_type: " << dev.device_type;
+  CHECK(ptr) << "buffer pointer is null";
+  CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
   FreeHexagonBuffer(ptr);
 }
 
@@ -109,18 +113,12 @@ struct HexagonWorkspacePool : public WorkspacePool {
 };
 
 void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
-  // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU;
-  bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
-                         (DLDeviceType(dev.device_type) == kDLCPU);
-  CHECK(is_valid_device) << "dev.device_type: " << dev.device_type;
+  CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
   return dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->AllocWorkspace(dev, size);
 }
 
 void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
-  // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU;
-  bool is_valid_device = (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
-                         (DLDeviceType(dev.device_type) == kDLCPU);
-  CHECK(is_valid_device) << "dev.device_type: " << dev.device_type;
+  CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type;
   CHECK(hexagon_buffer_map_.count(data) != 0)
       << "Attempt made to free unknown or already freed workspace allocation";
   dmlc::ThreadLocalStore<HexagonWorkspacePool>::Get()->FreeWorkspace(dev, data);
@@ -128,12 +126,14 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) {
 
 void* HexagonDeviceAPI::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape,
                                            DLDataType dtype, Optional<String> mem_scope) {
+  // must be Hexagon device (not CPU)
   CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
   CHECK((ndim == 1 || ndim == 2) && "Hexagon Device API supports only 1d and 2d allocations");
   return AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
 }
 
 void HexagonDeviceAPI::FreeVtcmWorkspace(Device dev, void* ptr) {
+  // must be Hexagon device (not CPU)
   CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type;
   FreeDataSpace(dev, ptr);
 }
diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h
index cc71adfb77946..6f65bf402757f 100644
--- a/src/runtime/hexagon/hexagon_device_api.h
+++ b/src/runtime/hexagon/hexagon_device_api.h
@@ -138,6 +138,16 @@ class HexagonDeviceAPI final : public DeviceAPI {
     hexagon_buffer_map_.insert({ptr, std::move(buf)});
     return ptr;
   }
+
+  /*! \brief Helper to check if the device type is valid for the Hexagon Device API
+   *  \return Boolean indicating whether the device type is valid
+   */
+  bool IsValidDevice(DLDevice dev) {
+    // Added kDLCPU since we use hexagon as a sub-target of LLVM which by default maps to kDLCPU
+    return (TVMDeviceExtType(dev.device_type) == kDLHexagon) ||
+           (DLDeviceType(dev.device_type) == kDLCPU);
+  }
+
   /*! \brief Helper to free a HexagonBuffer and unregister the result
    *  from the owned buffer map.
    */
diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
new file mode 100644
index 0000000000000..fbcee37cb1541
--- /dev/null
+++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../src/runtime/hexagon/hexagon_device_api.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::hexagon;
+
+class HexagonDeviceAPITest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    hexapi = HexagonDeviceAPI::Global();
+    cpu_dev.device_type = DLDeviceType(kDLCPU);
+    hex_dev.device_type = DLDeviceType(kDLHexagon);
+    invalid_dev.device_type = DLDeviceType(kDLExtDev);
+    int8.bits = 8;
+    int8.code = 0;
+    int8.lanes = 1;
+  }
+  DLDevice cpu_dev;
+  DLDevice hex_dev;
+  DLDevice invalid_dev;
+  DLDataType int8;
+  HexagonDeviceAPI* hexapi;
+  size_t nbytes{256};
+  size_t alignment{64};
+  int64_t shape1d[1]{256};
+  int64_t shape2d[2]{256, 256};
+  int64_t shape3d[3]{256, 256, 256};
+  Optional<String> default_scope;
+  Optional<String> invalid_scope{"invalid"};
+  Optional<String> global_scope{"global"};
+  Optional<String> global_vtcm_scope{"global.vtcm"};
+};
+
+TEST_F(HexagonDeviceAPITest, global) { CHECK(hexapi != nullptr); }
+
+TEST_F(HexagonDeviceAPITest, alloc_free_cpu) {
+  void* buf = hexapi->AllocDataSpace(cpu_dev, nbytes, alignment, int8);
+  CHECK(buf != nullptr);
+  hexapi->FreeDataSpace(cpu_dev, buf);
+}
+
+TEST_F(HexagonDeviceAPITest, alloc_free_hex) {
+  void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
+  CHECK(buf != nullptr);
+  hexapi->FreeDataSpace(hex_dev, buf);
+}
+
+TEST_F(HexagonDeviceAPITest, alloc_errors) {
+  // invalid device
+  EXPECT_THROW(hexapi->AllocDataSpace(invalid_dev, nbytes, alignment, int8), InternalError);
+  // 0 size
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 0, alignment, int8), InternalError);
+  // 0 alignment
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, nbytes, 0, int8), InternalError);
+}
+
+TEST_F(HexagonDeviceAPITest, free_errors) {
+  void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
+
+  // invalid device
+  EXPECT_THROW(hexapi->FreeDataSpace(invalid_dev, buf), InternalError);
+  // invalid pointer
+  EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, &buf), InternalError);
+  // nullptr
+  EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, nullptr), InternalError);
+  // double free
+  hexapi->FreeDataSpace(hex_dev, buf);
+  EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, buf), InternalError);
+}
+
+TEST_F(HexagonDeviceAPITest, allocnd_free_cpu) {
+  void* buf = hexapi->AllocDataSpace(cpu_dev, 3, shape3d, int8, global_scope);
+  CHECK(buf != nullptr);
+  hexapi->FreeDataSpace(cpu_dev, buf);
+}
+
+TEST_F(HexagonDeviceAPITest, allocnd_free_hex) {
+  void* buf = hexapi->AllocDataSpace(hex_dev, 3, shape3d, int8, global_scope);
+  CHECK(buf != nullptr);
+  hexapi->FreeDataSpace(hex_dev, buf);
+}
+
+TEST_F(HexagonDeviceAPITest, allocnd_free_hex_vtcm) {
+  void* buf1d = hexapi->AllocDataSpace(hex_dev, 1, shape1d, int8, global_vtcm_scope);
+  CHECK(buf1d != nullptr);
+  hexapi->FreeDataSpace(hex_dev, buf1d);
+
+  void* buf2d = hexapi->AllocDataSpace(hex_dev, 2, shape2d, int8, global_vtcm_scope);
+  CHECK(buf2d != nullptr);
+  hexapi->FreeDataSpace(hex_dev, buf2d);
+}
+
+TEST_F(HexagonDeviceAPITest, allocnd_erros) {
+  // invalid device
+  EXPECT_THROW(hexapi->AllocDataSpace(invalid_dev, 2, shape2d, int8, global_vtcm_scope),
+               InternalError);
+
+  // Hexagon VTCM allocations must have 0 (scalar), 1, or 2 dimensions
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 3, shape3d, int8, global_vtcm_scope), InternalError);
+
+  // null shape
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 2, nullptr, int8, global_vtcm_scope), InternalError);
+
+  // invalid memory scope
+  EXPECT_THROW(hexapi->AllocDataSpace(hex_dev, 2, shape2d, int8, invalid_scope), InternalError);
+
+  // cpu & global.vtcm scope
+  EXPECT_THROW(hexapi->AllocDataSpace(cpu_dev, 2, shape2d, int8, global_vtcm_scope), InternalError);
+}
+
+TEST_F(HexagonDeviceAPITest, alloc_scalar) {
+  void* cpuscalar = hexapi->AllocDataSpace(cpu_dev, 0, new int64_t, int8, global_scope);
+  CHECK(cpuscalar != nullptr);
+
+  void* hexscalar = hexapi->AllocDataSpace(hex_dev, 0, new int64_t, int8, global_vtcm_scope);
+  CHECK(hexscalar != nullptr);
+}
+
+// alloc and free of the same buffer on different devices should throw
+// but it currently works with no error
+// hexagon and cpu device types may merge long term which would make this test case moot
+// disabling this test case, for now
+// TODO(HWE): Re-enable or delete this test case once we land on device type strategy
+TEST_F(HexagonDeviceAPITest, DISABLED_alloc_free_diff_dev) {
+  void* buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8);
+  CHECK(buf != nullptr);
+  EXPECT_THROW(hexapi->FreeDataSpace(cpu_dev, buf), InternalError);
+}
diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
index 010c79b8f5544..6a60b8fa81b9d 100644
--- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py
+++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py
@@ -28,16 +28,12 @@
 # for example to run all "foo" tests twice and observe gtest output run
 # pytest -sv <this file> --gtests_args="--gtest_filter=*foo* --gtest_repeat=2"
 @tvm.testing.requires_hexagon
-@pytest.mark.skipif(
-    os.environ.get("HEXAGON_GTEST") == None,
-    reason="Test requires environment variable HEXAGON_GTEST set with a path to a Hexagon gtest version normally located at /path/to/hexagon/sdk/utils/googletest/gtest",
-)
 def test_run_unit_tests(hexagon_session: Session, gtest_args):
     try:
         func = hexagon_session._rpc.get_function("hexagon.run_unit_tests")
     except:
         print(
-            "Test requires TVM Runtime to be built with a Hexagon gtest version using Hexagon API cmake flag -DUSE_HEXAGON_GTEST=${HEXAGON_GTEST}"
+            "This test requires TVM Runtime to be built with a Hexagon gtest version using Hexagon API cmake flag -DUSE_HEXAGON_GTEST=/path/to/hexagon/sdk/utils/googletest/gtest"
         )
         raise
 
diff --git a/tests/scripts/task_build_hexagon_api.sh b/tests/scripts/task_build_hexagon_api.sh
index c5d05eaad80c5..a3b501d9c5546 100755
--- a/tests/scripts/task_build_hexagon_api.sh
+++ b/tests/scripts/task_build_hexagon_api.sh
@@ -37,9 +37,6 @@ cd build
 output_binary_directory=$(realpath ${PWD}/../../../build/hexagon_api_output)
 rm -rf ${output_binary_directory}
 
-# should be removed after Hexagon Docker update
-export HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest"
-
 cmake -DANDROID_ABI=arm64-v8a \
     -DANDROID_PLATFORM=android-28 \
     -DUSE_ANDROID_TOOLCHAIN="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \
@@ -47,6 +44,6 @@ cmake -DANDROID_ABI=arm64-v8a \
     -DUSE_HEXAGON_SDK="${HEXAGON_SDK_PATH}" \
     -DUSE_HEXAGON_TOOLCHAIN="${HEXAGON_TOOLCHAIN}" \
     -DUSE_OUTPUT_BINARY_DIR="${output_binary_directory}" \
-    -DUSE_HEXAGON_GTEST="${HEXAGON_GTEST}" ..
+    -DUSE_HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" ..
 
 make -j$(nproc)
diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh
index b639ac02a695b..274b348f09350 100755
--- a/tests/scripts/task_python_hexagon.sh
+++ b/tests/scripts/task_python_hexagon.sh
@@ -43,9 +43,6 @@ if [[ "${device_serial}" == "simulator" ]]; then
     export HEXAGON_SDK_ROOT=${HEXAGON_SDK_PATH}
 fi
 
-# should be removed after Hexagon Docker update
-export HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest"
-
 export ANDROID_SERIAL_NUMBER=${device_serial}
 run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon
 

From 9273ea5e49ca05404293cb651ced6d0bc0c0f206 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Wed, 18 May 2022 13:06:11 -0700
Subject: [PATCH 27/59] [Hexagon]Refactor Hexagon_SDK_PATH (#11282)

* refactor HEXAGON_SDK_PATH and remove HEXAGON_GTEST
---
 python/tvm/contrib/hexagon/tools.py         | 16 ++++++++--------
 tests/python/contrib/test_hexagon/README.md |  4 ++--
 tests/scripts/task_build_hexagon_api.sh     |  2 +-
 tests/scripts/task_config_build_hexagon.sh  |  2 +-
 tests/scripts/task_python_hexagon.sh        |  3 ---
 5 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py
index edf2821d31363..1aec8c7d565b5 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -41,7 +41,7 @@
 # Subsequent calls to 'link_shared' will use the newly registered linker.
 
 HEXAGON_TOOLCHAIN = os.environ.get("HEXAGON_TOOLCHAIN", default="")  # pylint: disable=invalid-name
-HEXAGON_SDK_PATH = os.environ.get("HEXAGON_SDK_PATH", default="")  # pylint: disable=invalid-name
+HEXAGON_SDK_ROOT = os.environ.get("HEXAGON_SDK_ROOT", default="")  # pylint: disable=invalid-name
 HEXAGON_LINK_MAIN = (
     pathlib.Path(HEXAGON_TOOLCHAIN) / "bin" / "hexagon-link"
 )  # pylint: disable=invalid-name
@@ -49,8 +49,8 @@
     pathlib.Path(HEXAGON_TOOLCHAIN) / "bin" / "hexagon-clang++"
 )  # pylint: disable=invalid-name
 HEXAGON_SDK_INCLUDE_DIRS = [  # pylint: disable=invalid-name
-    pathlib.Path(HEXAGON_SDK_PATH) / "incs",
-    pathlib.Path(HEXAGON_SDK_PATH) / "incs" / "stddef",
+    pathlib.Path(HEXAGON_SDK_ROOT) / "incs",
+    pathlib.Path(HEXAGON_SDK_ROOT) / "incs" / "stddef",
 ]
 
 
@@ -154,10 +154,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st
             " The environment variable HEXAGON_TOOLCHAIN is unset. Please export "
             + "HEXAGON_TOOLCHAIN in your environment."
         )
-    if not HEXAGON_SDK_PATH:
+    if not HEXAGON_SDK_ROOT:
         raise Exception(
-            " The environment variable HEXAGON_SDK_PATH is unset. Please export "
-            + "HEXAGON_SDK_PATH in your environment."
+            " The environment variable HEXAGON_SDK_ROOT is unset. Please export "
+            + "HEXAGON_SDK_ROOT in your environment."
         )
 
     # The AOT C codegen uses TVM runtime functions
@@ -180,8 +180,8 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st
         f"-I{tvm_dir / 'include'}",
         f"-I{tvm_dir / '3rdparty' / 'dlpack' / 'include'}",
         f"-I{tvm_dir / '3rdparty' / 'dmlc-core' / 'include'}",
-        f"-I{pathlib.Path(HEXAGON_SDK_PATH) / 'rtos' / 'qurt' / compute_arch / 'include'/ 'posix'}",
-        f"-I{pathlib.Path(HEXAGON_SDK_PATH) / 'rtos' / 'qurt' / compute_arch / 'include' / 'qurt'}",
+        f"-I{pathlib.Path(HEXAGON_SDK_ROOT) / 'rtos' / 'qurt' / compute_arch / 'include'/ 'posix'}",
+        f"-I{pathlib.Path(HEXAGON_SDK_ROOT) / 'rtos' / 'qurt' / compute_arch / 'include' / 'qurt'}",
         f"-DDMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>",
         f"-D_MACH_I32=int",
     ]
diff --git a/tests/python/contrib/test_hexagon/README.md b/tests/python/contrib/test_hexagon/README.md
index ce854bb0ab23b..a2b108f7a4edf 100644
--- a/tests/python/contrib/test_hexagon/README.md
+++ b/tests/python/contrib/test_hexagon/README.md
@@ -33,7 +33,7 @@ First, ensure to export Clang libraries to `LD_LIBRARY_PATH` and Hexagon toolcha
 ```bash
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"Path to `llvm-clang/lib` sub-directory. Currently we use LLVM-13 in TVM CI."
 
-export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the SDK, for example `HEXAGON_SDK_PATH/tools/HEXAGON_Tools/x.y.z/Tools`.  The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK."
+export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the SDK, for example `HEXAGON_SDK_ROOT/tools/HEXAGON_Tools/x.y.z/Tools`.  The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK."
 ```
 
 You can find more information about downloading [Hexagon SDK](https://developer.qualcomm.com/software/hexagon-dsp-sdk).
@@ -104,7 +104,7 @@ You have the options of running Hexagon test on real hardware or on Hexagon simu
 ```bash
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:"path to `llvm-clang/lib` sub-directory"
 
-export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the HexagonSDK, for example `HEXAGON_SDK_PATH/tools/HEXAGON_Tools/x.y.z/Tools`.  The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK."
+export HEXAGON_TOOLCHAIN="Path to Hexagon toolchain. It can be the Hexagon toolchain included in the HexagonSDK, for example `HEXAGON_SDK_ROOT/tools/HEXAGON_Tools/x.y.z/Tools`.  The `x.y.z` in the path is the toolchain version number, which is specific to the version of the SDK."
 
 export PYTHONPATH=$PYTHONPATH:"path to `tvm/python`"
 ```
diff --git a/tests/scripts/task_build_hexagon_api.sh b/tests/scripts/task_build_hexagon_api.sh
index a3b501d9c5546..8e8397a424dbb 100755
--- a/tests/scripts/task_build_hexagon_api.sh
+++ b/tests/scripts/task_build_hexagon_api.sh
@@ -41,7 +41,7 @@ cmake -DANDROID_ABI=arm64-v8a \
     -DANDROID_PLATFORM=android-28 \
     -DUSE_ANDROID_TOOLCHAIN="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \
     -DUSE_HEXAGON_ARCH=v68 \
-    -DUSE_HEXAGON_SDK="${HEXAGON_SDK_PATH}" \
+    -DUSE_HEXAGON_SDK="${HEXAGON_SDK_ROOT}" \
     -DUSE_HEXAGON_TOOLCHAIN="${HEXAGON_TOOLCHAIN}" \
     -DUSE_OUTPUT_BINARY_DIR="${output_binary_directory}" \
     -DUSE_HEXAGON_GTEST="${HEXAGON_SDK_PATH}/utils/googletest/gtest" ..
diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh
index 7bce64cddb5a9..a38180a2d9713 100755
--- a/tests/scripts/task_config_build_hexagon.sh
+++ b/tests/scripts/task_config_build_hexagon.sh
@@ -31,6 +31,6 @@ echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake
 echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake
 echo set\(USE_HEXAGON "ON"\) >> config.cmake
-echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_PATH}"\) >> config.cmake
+echo set\(USE_HEXAGON_SDK "${HEXAGON_SDK_ROOT}"\) >> config.cmake
 echo set\(USE_CCACHE OFF\) >> config.cmake
 echo set\(SUMMARIZE ON\) >> config.cmake
diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh
index 274b348f09350..883c296c5056a 100755
--- a/tests/scripts/task_python_hexagon.sh
+++ b/tests/scripts/task_python_hexagon.sh
@@ -38,9 +38,6 @@ if [[ "${device_serial}" == "simulator" ]]; then
 
     # Temporary workaround for symbol visibility
     export HEXAGON_SHARED_LINK_FLAGS="-Lbuild/hexagon_api_output -lhexagon_rpc_sim"
-
-    # HEXAGON_TOOLCHAIN is already set
-    export HEXAGON_SDK_ROOT=${HEXAGON_SDK_PATH}
 fi
 
 export ANDROID_SERIAL_NUMBER=${device_serial}

From ab8dfa151dfc965672bb4af6b752ddb50c9176ff Mon Sep 17 00:00:00 2001
From: Thomas Viehmann <tv.code@beamnet.de>
Date: Wed, 18 May 2022 23:06:24 +0200
Subject: [PATCH 28/59] use libtorch c++ distribution with c++11 strings in gpu
 image (#11346)

* use libtorch c++ distribution with c++11 strings in gpu image

* libtorch path

* don't activate libtorch before merging the image
---
 docker/Dockerfile.ci_gpu                  |  3 +++
 docker/install/ubuntu_install_libtorch.sh | 27 +++++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100755 docker/install/ubuntu_install_libtorch.sh

diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 5d0a642d3f202..73d13007f1d06 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -85,6 +85,9 @@ RUN bash /install/ubuntu_install_darknet.sh
 COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh
 RUN bash /install/ubuntu_install_onnx.sh
 
+COPY install/ubuntu_install_libtorch.sh /install/ubuntu_install_libtorch.sh
+RUN bash /install/ubuntu_install_libtorch.sh
+
 COPY install/ubuntu_install_tflite.sh /install/ubuntu_install_tflite.sh
 RUN bash /install/ubuntu_install_tflite.sh
 
diff --git a/docker/install/ubuntu_install_libtorch.sh b/docker/install/ubuntu_install_libtorch.sh
new file mode 100755
index 0000000000000..d7eddc85402a9
--- /dev/null
+++ b/docker/install/ubuntu_install_libtorch.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+set -u
+set -o pipefail
+
+pushd /usr/local/
+wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcpu.zip
+unzip libtorch-cxx11-abi-shared-with-deps-1.11.0+cpu.zip
+# now it is in /usr/local/libtorch
+popd

From 9aaf96ef13ec2f13fe677c023a10c5b81d1f5d8a Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 18 May 2022 14:07:34 -0700
Subject: [PATCH 29/59] [ci][actions] Add more HTTP retries for conda (#11360)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 .github/actions/setup/action.yml |  1 +
 conda/condarc                    | 42 ++++++++++++++++++++++++++++++++
 tests/lint/check_file_type.py    |  1 +
 3 files changed, 44 insertions(+)
 create mode 100644 conda/condarc

diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index 0ce2023ae4e0d..81a0d4d48a8db 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -15,6 +15,7 @@ runs:
       auto-activate-base: false
       use-only-tar-bz2: true
       python-version: 3.7
+      condarc-file: conda/condarc
   - name: Conda info
     shell: pwsh
     run: |
diff --git a/conda/condarc b/conda/condarc
new file mode 100644
index 0000000000000..eef4967f90fea
--- /dev/null
+++ b/conda/condarc
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# See https://docs.conda.io/projects/conda/en/latest/configuration.html for details
+
+# remote_connect_timeout_secs (float)
+#   The number of seconds conda will wait for your client to establish a
+#   connection to a remote url resource.
+# 
+remote_connect_timeout_secs: 10
+
+# remote_max_retries (int)
+#   The maximum number of retries each HTTP connection should attempt.
+# 
+remote_max_retries: 6
+
+# remote_backoff_factor (int)
+#   The factor determines how long an HTTP connection should wait
+#   before the next retry attempt.
+# 
+remote_backoff_factor: 5
+
+# remote_read_timeout_secs (float)
+#   Once conda has connected to a remote resource and sent an HTTP
+#   request, the read timeout is the number of seconds conda will wait for
+#   the server to send a response.
+# 
+remote_read_timeout_secs: 60.0
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index b01174bfee4c2..4dc0109bdef89 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -100,6 +100,7 @@
     "Makefile",
     "Doxyfile",
     "pylintrc",
+    "condarc",
     "rat-excludes",
     "log4j.properties",
     ".clang-format",

From c32224f314cf6128ddc2801a120232d9ffa80a54 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Wed, 18 May 2022 15:18:33 -0700
Subject: [PATCH 30/59] [skip ci] Revert "Fix function number datatype from
 char to uint16_t (#10014)" (#11363)

This reverts commit f34bd22ddc4e7064eabe9fac42c4c04f54ede399.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 include/tvm/runtime/crt/func_registry.h | 27 +----------------
 src/runtime/crt/common/func_registry.c  | 39 +++++++------------------
 src/target/func_registry_generator.cc   |  8 +----
 tests/crt/func_registry_test.cc         |  7 ++---
 4 files changed, 15 insertions(+), 66 deletions(-)

diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h
index 50737f8717987..4f8a19af591e8 100644
--- a/include/tvm/runtime/crt/func_registry.h
+++ b/include/tvm/runtime/crt/func_registry.h
@@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry {
   /*! \brief Names of registered functions, concatenated together and separated by \0.
    * An additional \0 is present at the end of the concatenated blob to mark the end.
    *
-   * Byte 0 and 1 are the number of functions in `funcs`.
+   * Byte 0 is the number of functions in `funcs`.
    */
   const char* names;
 
@@ -50,31 +50,6 @@ typedef struct TVMFuncRegistry {
   const TVMBackendPackedCFunc* funcs;
 } TVMFuncRegistry;
 
-/*!
- * \brief Get the of the number of functions from registry.
- *
- * \param reg TVMFunctionRegistry instance that contains the function.
- * \return The number of functions from registry.
- */
-uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg);
-
-/*!
- * \brief Set the number of functions to registry.
- *
- * \param reg TVMFunctionRegistry instance that contains the function.
- * \param num_funcs The number of functions
- * \return 0 when successful.
- */
-int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs);
-
-/*!
- * \brief Get the address of 0th function from registry.
- *
- * \param reg TVMFunctionRegistry instance that contains the function.
- * \return the address of 0th function from registry
- */
-const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg);
-
 /*!
  * \brief Get packed function from registry by name.
  *
diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c
index 49cef8fd70eb3..116a5c496f1bd 100644
--- a/src/runtime/crt/common/func_registry.c
+++ b/src/runtime/crt/common/func_registry.c
@@ -60,29 +60,14 @@ int strcmp_cursor(const char** cursor, const char* name) {
   return return_value;
 }
 
-uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) {
-  uint16_t num_funcs;
-  memcpy(&num_funcs, reg->names, sizeof(num_funcs));
-  return num_funcs;
-}
-
-int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) {
-  memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs));
-  return 0;
-}
-
-const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) {
-  // NOTE: first function name starts at index 2 to skip num_funcs.
-  return (reg->names + sizeof(uint16_t));
-}
-
 tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name,
                                        tvm_function_index_t* function_index) {
   tvm_function_index_t idx;
-  const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg);
+  const char* reg_name_ptr;
 
   idx = 0;
-  for (; *reg_name_ptr != '\0'; reg_name_ptr++) {
+  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
+  for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) {
     if (!strcmp_cursor(&reg_name_ptr, name)) {
       *function_index = idx;
       return kTvmErrorNoError;
@@ -97,9 +82,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n
 tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg,
                                            tvm_function_index_t function_index,
                                            TVMBackendPackedCFunc* out_func) {
-  uint16_t num_funcs;
+  uint8_t num_funcs;
 
-  num_funcs = TVMFuncRegistry_GetNumFuncs(reg);
+  num_funcs = reg->names[0];
   if (function_index >= num_funcs) {
     return kTvmErrorFunctionIndexInvalid;
   }
@@ -116,8 +101,7 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 
   reg->registry.names = (const char*)buffer;
   buffer[0] = 0;  // number of functions present in buffer.
-  buffer[1] = 0;  // note that we combine the first two elements to form a 16-bit function index.
-  buffer[2] = 0;  // end of names list marker.
+  buffer[1] = 0;  // end of names list marker.
 
   // compute a guess of the average size of one entry:
   //  - assume average function name is around ~10 bytes
@@ -133,12 +117,13 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name,
                                            TVMBackendPackedCFunc func, int override) {
   size_t idx;
-  char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry));
+  char* reg_name_ptr;
 
   idx = 0;
   // NOTE: safe to discard const qualifier here, since reg->registry.names was set from
   // TVMMutableFuncRegistry_Create above.
-  for (; *reg_name_ptr != 0; reg_name_ptr++) {
+  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
+  for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) {
     if (!strcmp_cursor((const char**)&reg_name_ptr, name)) {
       if (override == 0) {
         return kTvmErrorFunctionAlreadyDefined;
@@ -164,11 +149,7 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch
   reg_name_ptr += name_len + 1;
   *reg_name_ptr = 0;
   ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func;
-
-  uint16_t num_funcs;
-  // increment num_funcs.
-  num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1;
-  TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs);
+  ((char*)reg->registry.names)[0]++;  // increment num_funcs.
 
   return kTvmErrorNoError;
 }
diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc
index d679bf379b628..7c948d50cbb94 100644
--- a/src/target/func_registry_generator.cc
+++ b/src/target/func_registry_generator.cc
@@ -31,13 +31,7 @@ namespace target {
 
 std::string GenerateFuncRegistryNames(const Array<String>& function_names) {
   std::stringstream ss;
-
-  unsigned char function_nums[sizeof(uint16_t)];
-  *reinterpret_cast<uint16_t*>(function_nums) = function_names.size();
-  for (auto f : function_nums) {
-    ss << f;
-  }
-
+  ss << (unsigned char)(function_names.size());
   for (auto f : function_names) {
     ss << f << '\0';
   }
diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc
index 5962a3acee39f..9f0e7f8d1a5aa 100644
--- a/tests/crt/func_registry_test.cc
+++ b/tests/crt/func_registry_test.cc
@@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) {
 }
 
 TEST(FuncRegistry, Empty) {
-  TVMFuncRegistry registry{"\000\000", NULL};
+  TVMFuncRegistry registry{"\000", NULL};
 
   EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(&registry, "foo", NULL));
   EXPECT_EQ(kTvmErrorFunctionIndexInvalid,
@@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_
 }
 
 // Matches the style of registry defined in generated C modules.
-const char* kBasicFuncNames = "\002\000Foo\0Bar\0";  // NOTE: final \0
+const char* kBasicFuncNames = "\002Foo\0Bar\0";  // NOTE: final \0
 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar};
 const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs};
 
@@ -111,8 +111,7 @@ TEST(FuncRegistry, ConstGlobalRegistry) {
 
   // Foo
   EXPECT_EQ(kBasicFuncNames[0], 2);
-  EXPECT_EQ(kBasicFuncNames[1], 0);
-  EXPECT_EQ(kBasicFuncNames[2], 'F');
+  EXPECT_EQ(kBasicFuncNames[1], 'F');
   EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index));
   EXPECT_EQ(0, func_index);
 

From ddfa1da691bacbb0018b53fca8409c5cfd6dbf3a Mon Sep 17 00:00:00 2001
From: Mohamad Katanbaf <mtkatanbaf@gmail.com>
Date: Wed, 18 May 2022 16:09:10 -0700
Subject: [PATCH 31/59] [bug fix] skip "__nop" functions in
 graph_executor_debug (#11353)

* bug fix, skip __nop functions in running operation over RPC

Co-authored-by: Mohamad <mkatanbaf@users.noreply.github.com>
---
 src/runtime/graph_executor/debug/graph_executor_debug.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 97d89206f5dcb..bd3b0db0403f3 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -140,6 +140,11 @@ class GraphExecutorDebug : public GraphExecutor {
       return 0;
     }
 
+    if (nodes_[index].param.func_name == "__nop") {
+      LOG_INFO << "Skipping __nop function";
+      return 0;
+    }
+
     const Device& dev = data_entry_[entry_id(index, 0)]->device;
     TVMOpParam param = nodes_[index].param;
     std::string name = param.func_name;

From ffc0443913d837c6b7a6ec55375ea29cf3d1fa7c Mon Sep 17 00:00:00 2001
From: heliqi <1101791222@qq.com>
Date: Thu, 19 May 2022 03:53:03 -0500
Subject: [PATCH 32/59] [Frontend] [PaddlePaddle] Add split operator (#11354)

* support split op of paddlepaddle

* black formatting
---
 python/tvm/relay/frontend/paddlepaddle.py     | 45 +++++++++++++++++++
 .../frontend/paddlepaddle/test_forward.py     | 38 ++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py
index 7f2460d66eeb0..7042154709ae8 100644
--- a/python/tvm/relay/frontend/paddlepaddle.py
+++ b/python/tvm/relay/frontend/paddlepaddle.py
@@ -1920,6 +1920,50 @@ def convert_softsign(g, op, block):
     g.add_node(op.output("Out")[0], out)
 
 
+def convert_split(g, op, block):
+    """Operator converter for split."""
+
+    x = g.get_node(op.input("X")[0])
+    axis = op.input("AxisTensor")
+    if axis:
+        axis = g.get_node(axis[0])
+        axis, infered = try_infer_value(axis, g.get_params())
+        if infered:
+            axis = axis.tolist()[0]
+    else:
+        axis = op.attr("axis")
+
+    sections = op.input("SectionsTensorList")
+    if sections:
+        tmp_section = []
+        for i in sections:
+            i = g.get_node(i)
+            i, infered = try_infer_value(i, g.get_params())
+            if infered:
+                i = i.tolist()
+            else:
+                raise ValueError("Dynamic Split not yet supported.")
+            tmp_section.extend(i)
+        sections = tmp_section
+    else:
+        sections = op.attr("sections")
+    if sections:
+        indices = []
+        split_index = 0
+        for i in sections[:-1]:
+            if i == -1:
+                input_shape = infer_shape(x)[axis]
+                i = input_shape - np.sum(sections) - 1
+            split_index += i
+            indices.append(split_index)
+    else:
+        indices = op.attr("num")
+
+    out = _op.split(x, indices, axis)
+    for i, out_i in enumerate(out):
+        g.add_node(op.output("Out")[i], out_i)
+
+
 def convert_square(g, op, block):
     """Operator converter for square."""
 
@@ -2092,6 +2136,7 @@ def convert_unsqueeze(g, op, block):
     "softmax": convert_softmax,
     "softplus": convert_softplus,
     "softsign": convert_softsign,
+    "split": convert_split,
     "strided_slice": convert_slice,
     "sqrt": convert_unary_op,
     "square": convert_square,
diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py
index 9fa4063755f70..0f243e0ea02c6 100644
--- a/tests/python/frontend/paddlepaddle/test_forward.py
+++ b/tests/python/frontend/paddlepaddle/test_forward.py
@@ -782,6 +782,44 @@ def full2(inputs):
     verify_model(full2, input_data=[input_data])
 
 
+@tvm.testing.uses_gpu
+def test_forward_split():
+    class Split(nn.Layer):
+        def __init__(
+            self, axis=None, num_or_sections=None, axis_is_tensor=False, num_is_tensor=False
+        ):
+            super(Split, self).__init__()
+            self.axis = axis
+            self.num_or_sections = num_or_sections
+            self.axis_is_tensor = axis_is_tensor
+            self.num_is_tensor = num_is_tensor
+
+        @paddle.jit.to_static
+        def forward(self, inputs):
+            axis = self.axis
+            if self.axis_is_tensor:
+                axis = paddle.to_tensor(axis, dtype="int32")
+            num_or_sections = self.num_or_sections
+            if self.num_is_tensor:
+                new_num_or_sections = []
+                for i in num_or_sections:
+                    if isinstance(i, list):
+                        i = paddle.to_tensor(i, dtype="int32")
+                    new_num_or_sections.append(i)
+                num_or_sections = new_num_or_sections
+            return paddle.split(inputs, num_or_sections=num_or_sections, axis=axis)
+
+    input_shape = [3, 6, 2]
+    input_data = paddle.rand(input_shape, dtype="float32")
+    verify_model(Split(axis=1, num_or_sections=3), input_data=input_data)
+    verify_model(
+        Split(axis=[1], num_or_sections=[2, 3, 1], axis_is_tensor=True), input_data=input_data
+    )
+    verify_model(
+        Split(axis=1, num_or_sections=[2, -1, [3]], num_is_tensor=True), input_data=input_data
+    )
+
+
 @tvm.testing.uses_gpu
 def test_forward_squeeze():
     class Squeeze(nn.Layer):

From 534c38bef3c98f8094bce6780cabdeedb017645b Mon Sep 17 00:00:00 2001
From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com>
Date: Thu, 19 May 2022 04:37:41 -0700
Subject: [PATCH 33/59] [Relay] Support i16, f16 scalars in Relay text (#11224)

While testing fp16 models for Collage discovered the Relay text
format did not support f16. While adding that cleaned up scalar handling
in general. However I left two inlined tests for 'is simple const'
in place (fuse_ops.cc and memory_alloc.cc) since it's not clear whether
they should remain specific to just {i,f}{32,64} or whether they can
be replaced with the support::IsSimpleScalar central predicate.
---
 src/parser/parser.cc                       |  45 +----
 src/parser/tokenizer.h                     | 104 +++++++----
 src/printer/doc.cc                         |   7 +-
 src/printer/relay_text_printer.cc          |  80 ++++----
 src/printer/text_printer.h                 |   7 -
 src/support/scalars.cc                     | 202 +++++++++++++++++++++
 src/support/scalars.h                      |  67 +++++++
 tests/cpp/support/scalars_test.cc          |  63 +++++++
 tests/python/relay/test_ir_parser.py       |  41 ++++-
 tests/python/relay/test_ir_text_printer.py |  37 ++--
 10 files changed, 505 insertions(+), 148 deletions(-)
 create mode 100644 src/support/scalars.cc
 create mode 100644 src/support/scalars.h
 create mode 100644 tests/cpp/support/scalars_test.cc

diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index 9b15893092f7f..f51e3e5c9737f 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -35,10 +35,12 @@
 
 #include <fstream>
 
+#include "../support/scalars.h"
 #include "./meta_ref.h"
 #include "./op_table.h"
 #include "./span_check.h"
 #include "./tokenizer.h"
+#include "tvm/runtime/builtin_fp16.h"
 
 namespace tvm {
 namespace parser {
@@ -534,49 +536,15 @@ class Parser {
   /*! \brief Convert a numeric token to an NDArray for embedding into the Relay program. */
   NDArray NumberToNDArray(const Token& token) {
     if (token->token_type == TokenType::kInteger) {
-      DLDevice dev = {DLDeviceType::kDLCPU, 0};
-      int64_t i = Downcast<tvm::Integer>(token->data);
-      if (i > std::numeric_limits<int32_t>::max()) {
-        auto dtype = String2DLDataType("int64");
-        auto data = NDArray::Empty({}, dtype, dev);
-        auto array = reinterpret_cast<int64_t*>(data->data);
-        // revisit this, literal node issue.
-        array[0] = i;
-        return data;
-      } else {
-        auto dtype = String2DLDataType("int32");
-        auto data = NDArray::Empty({}, dtype, dev);
-        auto array = reinterpret_cast<int32_t*>(data->data);
-        // revisit this, literal node issue.
-        array[0] = i;
-        return data;
-      }
+      return support::IntImmToNDArray(Downcast<tvm::IntImm>(token->data));
     } else if (token->token_type == TokenType::kFloat) {
-      DLDevice dev = {DLDeviceType::kDLCPU, 0};
-      auto float_imm = Downcast<tvm::FloatImm>(token->data);
-      auto data = NDArray::Empty({}, float_imm->dtype, dev);
-      auto array = reinterpret_cast<float*>(data->data);
-      // revisit this, literal node issue.
-      // TODO(@jroesch): bounds checking
-      float value = float_imm->value;
-      array[0] = value;
-      return data;
+      return support::FloatImmToNDArray(Downcast<tvm::FloatImm>(token->data));
     } else {
       LOG(FATAL) << "internal error: should only call this function on numeric tokens";
-      return NDArray();
+      return {};
     }
   }
 
-  /*! \brief Convert a boolean value to an NDArray for embedding into the Relay program. */
-  NDArray BooleanToNDarray(bool value) {
-    DLDevice dev = {DLDeviceType::kDLCPU, 0};
-    auto dtype = String2DLDataType("bool");
-    auto data = NDArray::Empty({}, dtype, dev);
-    auto array = reinterpret_cast<bool*>(data->data);
-    array[0] = value;
-    return data;
-  }
-
   [[noreturn]] void ParseError(const Token& token, const std::string& msg) {
     throw std::runtime_error(msg);
   }
@@ -1573,8 +1541,7 @@ class Parser {
         case TokenType::kBoolean: {
           Consume(TokenType::kBoolean);
           int64_t value = Downcast<tvm::Integer>(next->data);
-          auto boolean = BooleanToNDarray(value);
-          Expr e = Constant(boolean, next->span);
+          Expr e = Constant(support::BoolToNDArray(value), next->span);
           ICHECK(e->span.defined()) << "constant spans must be defined";
           return e;
         }
diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h
index f8098cf941005..4ac1ceef26dce 100644
--- a/src/parser/tokenizer.h
+++ b/src/parser/tokenizer.h
@@ -34,6 +34,7 @@
 #include <utility>
 #include <vector>
 
+#include "../support/scalars.h"
 #include "./meta_ref.h"
 #include "./token.h"
 
@@ -174,35 +175,16 @@ struct Tokenizer {
   Token ParseNumber(bool is_pos, bool is_float, std::string number) {
     ICHECK(number.size() > 0) << "an empty string is an invalid number";
 
-    if (!is_float) {
-      auto token = NewToken(TokenType::kInteger);
-      size_t index = 0;
-      int64_t value = 0;
-      try {
-        value = std::stoll(number, &index);
-      } catch (const std::invalid_argument& err) {
-        this->diag_ctx.Emit(Diagnostic::Error(token->span) << "invalid number `" << number << "`");
-      } catch (const std::out_of_range& err) {
-        this->diag_ctx.Emit(Diagnostic::Error(token->span) << "invalid number `" << number << "`");
-      }
-      if (number.size() <= index) {
-        value = is_pos ? value : -value;
-        if (value > std::numeric_limits<int32_t>::max()) {
-          token->data = tvm::IntImm(DataType::Int(64), value);
-        } else {
-          token->data = tvm::IntImm(DataType::Int(32), value);
-        }
-        return token;
-      }
+    Token token = NewToken(is_float ? TokenType::kFloat : TokenType::kInteger);
+    size_t suffix_pos = number.rfind(is_float ? 'f' : 'i');
+    if (suffix_pos == std::string::npos) {
+      suffix_pos = number.size();
+    }
+    std::string literal_text = number.substr(0, suffix_pos);
+    std::string suffix;
+    if (suffix_pos < number.size()) {
+      suffix = number.substr(suffix_pos + 1, number.size() - suffix_pos);
     }
-    auto token = NewToken(TokenType::kFloat);
-
-    auto suffix_pos = number.rfind("f");
-
-    auto literal_text = number.substr(0, suffix_pos);
-
-    auto suffix = number.substr(suffix_pos + 1, number.size() - suffix_pos);
-
     int width = 32;
 
     if (suffix.size()) {
@@ -217,9 +199,62 @@ struct Tokenizer {
       }
     }
 
-    double value = stod(literal_text);
-    value = is_pos ? value : -value;
-    token->data = tvm::FloatImm(DataType::Float(width), value);
+    if (is_float) {
+      double value = 0.0;
+      size_t index = 0;
+      try {
+        value = stod(literal_text, &index);
+      } catch (const std::invalid_argument& err) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid floating point number `" << literal_text << "`");
+      } catch (const std::out_of_range& err) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid floating point number `" << literal_text << "`");
+      }
+      if (index < literal_text.size()) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid floating point number `" << literal_text << "`");
+      }
+      value = is_pos ? value : -value;
+      token->data = support::ValueToFloatImm(value, width);
+      if (!token->data.defined()) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "floating point number `" << literal_text
+                            << "` unrepresentable in width " << width);
+        token->data = support::ValueToFloatImm(0.0, width);
+      }
+    } else {
+      int64_t value = 0;
+      size_t index = 0;
+      try {
+        value = std::stoll(literal_text, &index);
+      } catch (const std::invalid_argument& err) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid integer number `" << literal_text << "`");
+      } catch (const std::out_of_range& err) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid integer number `" << literal_text << "`");
+      }
+      if (index < literal_text.size()) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "invalid integer number `" << literal_text << "`");
+      }
+      value = is_pos ? value : -value;
+      token->data = support::ValueToIntImm(value, width);
+      if (!token->data.defined() && suffix.empty()) {
+        // Without any i suffix the legacy behavior was to default to int64 if out of range
+        // for int32.
+        width = 64;
+        token->data = support::ValueToIntImm(value, width);
+      }
+      if (!token->data.defined()) {
+        this->diag_ctx.Emit(Diagnostic::Error(token->span)
+                            << "integer number `" << literal_text << "` unrepresentable in width "
+                            << width);
+        token->data = support::ValueToIntImm(0, width);
+      }
+    }
+
     return token;
   }
 
@@ -230,14 +265,13 @@ struct Tokenizer {
     }
 
     bool is_float = false;
-
-    // Remove trailing floating point prefix.
-    if (More() && Peek() == 'f') {
+    if (More() && (Peek() == 'f' || Peek() == 'i')) {
+      is_float = Peek() == 'f';
+      // Capture trailing width suffix
       ss << Next();
       while (More() && IsNumeric(Peek())) {
         ss << Next();
       }
-      is_float = true;
     }
     return ParseNumber(is_pos, is_float, ss.str());
   }
diff --git a/src/printer/doc.cc b/src/printer/doc.cc
index f7d9fdfd7dfb3..b06995fb1286a 100644
--- a/src/printer/doc.cc
+++ b/src/printer/doc.cc
@@ -52,12 +52,7 @@ TVM_REGISTER_OBJECT_TYPE(DocTextNode);
 
 class DocText : public DocAtom {
  public:
-  explicit DocText(std::string str) {
-    if (str.find_first_of("\t\n") != str.npos) {
-      LOG(WARNING) << "text node: '" << str << "' should not have tab or newline.";
-    }
-    data_ = runtime::make_object<DocTextNode>(str);
-  }
+  explicit DocText(std::string str) { data_ = runtime::make_object<DocTextNode>(str); }
 
   TVM_DEFINE_OBJECT_REF_METHODS(DocText, DocAtom, DocTextNode);
 };
diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc
index 97231931ad88e..35daf588fbeb8 100644
--- a/src/printer/relay_text_printer.cc
+++ b/src/printer/relay_text_printer.cc
@@ -43,9 +43,11 @@
 #include "../ir/attr_functor.h"
 #include "../parser/meta_ref.h"
 #include "../relay/analysis/dependency_graph.h"
+#include "../support/scalars.h"
 #include "doc.h"
 #include "meta_data.h"
 #include "text_printer.h"
+#include "tvm/runtime/builtin_fp16.h"
 
 namespace tvm {
 namespace relay {
@@ -61,8 +63,17 @@ Doc RelayTextPrinter::PrintOptionalInfo(const Expr& expr) {
   }
   // default annotations
   if (annotate_ == nullptr) {
-    if ((expr.as<ConstantNode>() || expr.as<CallNode>()) && expr->checked_type_.defined()) {
-      doc << " /* ty=" << Print(expr->checked_type()) << " */";
+    if ((expr.as<ConstantNode>() || expr.as<CallNode>() || expr.as<VarNode>() ||
+         expr.as<FunctionNode>() || expr.as<TupleNode>() || expr.as<TupleGetItemNode>()) &&
+        (expr->checked_type_.defined() || expr->span.defined())) {
+      doc << " /*";
+      if (expr->checked_type_.defined()) {
+        doc << " ty=" << Print(expr->checked_type());
+      }
+      if (expr->span.defined()) {
+        doc << " span=" << PrintSpan(expr->span);
+      }
+      doc << " */";
     }
   } else {
     std::string annotated_expr = annotate_(expr);
@@ -219,7 +230,7 @@ Doc RelayTextPrinter::AllocVar(const Var& var) {
     name = "v" + name;
   }
   Doc val = GetUniqueName("%" + name);
-  memo_[var] = val;
+  memo_[var] = val;  // Referential occurrences will not include the following.
   if (!var->virtual_device()->IsFullyUnconstrained()) {
     val << " {" << kVirtualDevice << "=" << PrintAttributeValue(var->virtual_device()) << "}";
   }
@@ -335,51 +346,17 @@ Doc RelayTextPrinter::PrintExpr(const Expr& expr, bool meta, bool try_inline, bo
 // first time.
 Doc RelayTextPrinter::VisitExpr_(const VarNode* op) { return AllocVar(GetRef<Var>(op)); }
 
-/*!
- * \brief special method to print out const scalar
- * \param dtype The data type
- * \param value The value to be printed.
- */
-template <typename T>
-Doc RelayTextPrinter::ScalarLiteral(DataType dtype, const T& value) {
-  std::ostringstream os;
-  if (dtype == DataType::Int(32)) {
-    os << value;
-  } else if (dtype == DataType::Float(32)) {
-    os << value << 'f';
-  } else if (dtype == DataType::Float(64)) {
-    os << value << "f64";
-  } else if (dtype == DataType::Bool()) {
-    return Doc::PyBoolLiteral(value != 0);
-  } else {
-    os << value;
-  }
-  return Doc::Text(os.str());
-}
-
 Doc RelayTextPrinter::VisitExpr_(const ConstantNode* op) {
   // Print out simple scalars directly.
-  if (op->is_scalar()) {
-    std::ostringstream os;
-    DataType dtype = DataType(op->data->dtype);
-    ICHECK_EQ(op->data->device.device_type, kDLCPU);
-    if (dtype == DataType::Int(32)) {
-      return ScalarLiteral(dtype, static_cast<const int32_t*>(op->data->data)[0]);
-    } else if (dtype == DataType::Int(64)) {
-      return ScalarLiteral(dtype, static_cast<const int64_t*>(op->data->data)[0]);
-    } else if (dtype == DataType::Float(32)) {
-      return ScalarLiteral(dtype, static_cast<const float*>(op->data->data)[0]);
-    } else if (dtype == DataType::Float(64)) {
-      return ScalarLiteral(dtype, static_cast<const double*>(op->data->data)[0]);
-    } else if (dtype == DataType::Bool()) {
-      return ScalarLiteral(dtype, static_cast<const uint8_t*>(op->data->data)[0]);
-    }
+  if (support::IsSimpleScalar(op)) {
+    return Doc::Text(support::NDArrayScalarToString(op->data));
   }
-  // default fall-back, record it as meta node.
+  // Fallback: record it as a meta node.
   Doc doc;
   // Don't append optional_info. Because the entry function is Print,
   // and it will append the optional_info afterwards.
-  return doc << PrintExpr(GetRef<Expr>(op), true, false, false);
+  return doc << PrintExpr(GetRef<Expr>(op), /*meta=*/true, /*try_inline=*/false,
+                          /*optional_info=*/false);
 }
 
 Doc RelayTextPrinter::VisitExpr_(const TupleNode* op) {
@@ -540,9 +517,6 @@ Doc RelayTextPrinter::VisitExpr_(const CallNode* op) {
     return doc;
   } else {
     doc << "(" << Doc::Concat(args) << ")";
-    if (op->span.defined()) {
-      doc << " /* " << PrintSpan(op->span) << " */";
-    }
     return doc;
   }
 }
@@ -799,11 +773,21 @@ Doc RelayTextPrinter::VisitAttr_(const ArrayNode* op) {
 }
 
 Doc RelayTextPrinter::VisitAttr_(const tir::IntImmNode* op) {
-  return ScalarLiteral(op->dtype, op->value);
+  if (support::IsSimpleScalarDtype(op->dtype)) {
+    return Doc::Text(support::IntImmToString(GetRef<IntImm>(op)));
+  } else {
+    // Fallback: Print int64_t without width suffix.
+    return Doc::Text(std::to_string(op->value));
+  }
 }
 
 Doc RelayTextPrinter::VisitAttr_(const tir::FloatImmNode* op) {
-  return ScalarLiteral(op->dtype, op->value);
+  if (support::IsSimpleScalarDtype(op->dtype)) {
+    return Doc::Text(support::FloatImmToString(GetRef<FloatImm>(op)));
+  } else {
+    // Fallback: Print double without width suffix.
+    return Doc::Text(std::to_string(op->value));
+  }
 }
 
 Doc RelayTextPrinter::VisitAttr_(const tir::StringImmNode* op) {
@@ -977,7 +961,7 @@ Doc RelayTextPrinter::PrintSpan(const Span& span) {
   Doc doc;
   const auto* span_node = span.as<SpanNode>();
   ICHECK(span_node);
-  doc << span_node->source_name->name;
+  doc << span_node->source_name->name << ":" << span_node->line << ":" << span_node->column;
   return doc;
 }
 
diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h
index c34c4a5b6dbee..05a00e3305e1b 100644
--- a/src/printer/text_printer.h
+++ b/src/printer/text_printer.h
@@ -152,13 +152,6 @@ class RelayTextPrinter : public ExprFunctor<Doc(const Expr&)>,
   // Should only be triggered when op is a free variable being visited for the
   // first time.
   Doc VisitExpr_(const VarNode* op) final;
-  /*!
-   * \brief special method to print out const scalar
-   * \param dtype The data type
-   * \param value The value to be printed.
-   */
-  template <typename T>
-  static Doc ScalarLiteral(DataType dtype, const T& value);
   Doc VisitExpr_(const ConstantNode* op) final;
   Doc VisitExpr_(const TupleNode* op) final;
   Doc VisitExpr_(const TupleGetItemNode* op) final;
diff --git a/src/support/scalars.cc b/src/support/scalars.cc
new file mode 100644
index 0000000000000..9caa7ca589156
--- /dev/null
+++ b/src/support/scalars.cc
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/support/scalars.cc
+ * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms.
+ */
+
+#include "./scalars.h"
+
+#include "tvm/relay/expr.h"
+#include "tvm/runtime/builtin_fp16.h"
+
+namespace tvm {
+namespace support {
+
+/*! \brief The standard scalar dtypes. */
+static const DataType kInt16 = DataType::Int(16);
+static const DataType kInt32 = DataType::Int(32);
+static const DataType kInt64 = DataType::Int(64);
+static const DataType kFloat16 = DataType::Float(16);
+static const DataType kFloat32 = DataType::Float(32);
+static const DataType kFloat64 = DataType::Float(64);
+static const DataType kBool = DataType::Bool();
+
+bool IsSimpleScalarDtype(DataType dtype) {
+  return dtype == kInt16 || dtype == kInt32 || dtype == kInt64 || dtype == kFloat16 ||
+         dtype == kFloat32 || dtype == kFloat64 || dtype == kBool;
+}
+
+bool IsSimpleScalar(const relay::ConstantNode* constant_node) {
+  return constant_node->is_scalar() && IsSimpleScalarDtype(DataType(constant_node->data->dtype));
+}
+
+runtime::NDArray IntImmToNDArray(const IntImm& int_imm) {
+  DLDevice dev = {DLDeviceType::kDLCPU, 0};
+  auto data = runtime::NDArray::Empty({}, int_imm->dtype, dev);
+  if (int_imm.dtype() == kInt16) {
+    auto* array = reinterpret_cast<int16_t*>(data->data);
+    array[0] = static_cast<int16_t>(int_imm->value);
+  } else if (int_imm.dtype() == kInt32) {
+    auto* array = reinterpret_cast<int32_t*>(data->data);
+    array[0] = static_cast<int32_t>(int_imm->value);
+  } else if (int_imm.dtype() == kInt64) {
+    auto* array = reinterpret_cast<int64_t*>(data->data);
+    array[0] = int_imm->value;
+  } else {
+    LOG(FATAL) << "Unrecognized numeric literal dtype: " << DLDataType2String(int_imm.dtype());
+  }
+  return data;
+}
+
+runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm) {
+  DLDevice dev = {DLDeviceType::kDLCPU, 0};
+  auto data = runtime::NDArray::Empty({}, float_imm->dtype, dev);
+  if (float_imm.dtype() == kFloat16) {
+    auto* array = reinterpret_cast<uint16_t*>(data->data);
+    array[0] = __gnu_f2h_ieee(static_cast<float>(float_imm->value));
+  } else if (float_imm.dtype() == kFloat32) {
+    auto* array = reinterpret_cast<float*>(data->data);
+    array[0] = static_cast<float>(float_imm->value);
+  } else if (float_imm.dtype() == kFloat64) {
+    auto* array = reinterpret_cast<double*>(data->data);
+    array[0] = float_imm->value;
+  } else {
+    LOG(FATAL) << "Unrecognized numeric literal dtype: " << DLDataType2String(float_imm.dtype());
+  }
+  return data;
+}
+
+runtime::NDArray BoolToNDArray(bool value) {
+  DLDevice dev = {DLDeviceType::kDLCPU, 0};
+  auto data = runtime::NDArray::Empty({}, kBool, dev);
+  auto array = reinterpret_cast<bool*>(data->data);
+  array[0] = value;
+  return data;
+}
+
+std::string NDArrayScalarToString(const runtime::NDArray& data) {
+  std::ostringstream os;
+  DataType dtype(data->dtype);
+  ICHECK_EQ(data->device.device_type, kDLCPU) << "Scalars must reside on the CPU to be printed";
+  if (dtype == kInt16) {
+    auto value = static_cast<const int16_t*>(data->data)[0];
+    os << value << "i16";
+  } else if (dtype == kInt32) {
+    auto value = static_cast<const int32_t*>(data->data)[0];
+    os << value;
+  } else if (dtype == kInt64) {
+    auto value = static_cast<const int64_t*>(data->data)[0];
+    os << value << "i64";
+  } else if (dtype == kFloat16) {
+    auto value = __gnu_h2f_ieee(static_cast<const uint16_t*>(data->data)[0]);
+    os << value << "f16";
+  } else if (dtype == kFloat32) {
+    auto value = static_cast<const float*>(data->data)[0];
+    os << value << "f";
+  } else if (dtype == kFloat64) {
+    auto value = static_cast<const double*>(data->data)[0];
+    os << value << "f64";
+  } else if (dtype == kBool) {
+    auto value = static_cast<const uint8_t*>(data->data)[0];
+    os << (value ? "True" : "False");
+  } else {
+    LOG(FATAL) << "Unrecognized NDArray scalar dtype: " << DLDataType2String(dtype);
+  }
+  return os.str();
+}
+
+std::string IntImmToString(const IntImm& int_imm) {
+  std::ostringstream os;
+  if (int_imm->dtype == kInt16) {
+    os << int_imm->value << "i16";
+  } else if (int_imm->dtype == kInt32) {
+    os << int_imm->value;
+  } else if (int_imm->dtype == kInt64) {
+    os << int_imm->value << "i64";
+  } else if (int_imm->dtype == kBool) {
+    os << (int_imm->value ? "True" : "False");
+  } else {
+    LOG(FATAL) << "Unrecognised IntImm dtype: " << DLDataType2String(int_imm->dtype);
+  }
+  return os.str();
+}
+
+std::string FloatImmToString(const FloatImm& float_imm) {
+  std::ostringstream os;
+  if (float_imm->dtype == kFloat16) {
+    os << float_imm->value << "f16";
+  } else if (float_imm->dtype == kFloat32) {
+    os << float_imm->value << "f";
+  } else if (float_imm->dtype == kFloat64) {
+    os << float_imm->value << "f64";
+  } else {
+    LOG(FATAL) << "Unrecognised FloatImm dtype: " << DLDataType2String(float_imm->dtype);
+  }
+  return os.str();
+}
+
+IntImm ValueToIntImm(int64_t value, int width) {
+  if (width == 16) {
+    if (value < std::numeric_limits<int16_t>::min() ||
+        value > std::numeric_limits<int16_t>::max()) {
+      return {};
+    }
+    return IntImm(kInt16, value);
+  } else if (width == 32) {
+    if (value < std::numeric_limits<int32_t>::min() ||
+        value > std::numeric_limits<int32_t>::max()) {
+      return {};
+    }
+    return IntImm(kInt32, value);
+  } else if (width == 64) {
+    return IntImm(kInt64, value);
+  } else {
+    LOG(FATAL) << "Unrecognized int scalar width: " << width;
+    return {};
+  }
+}
+
+// 2^15 * (1 + 1023/1024)
+// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format
+constexpr double kMaxFloat16 = 65504.0;
+
+FloatImm ValueToFloatImm(double value, int width) {
+  if (width == 16) {
+    if (!std::isinf(value) && (value < -kMaxFloat16 || value > kMaxFloat16)) {
+      return {};
+    }
+    return FloatImm(kFloat16, value);
+  } else if (width == 32) {
+    if (!std::isinf(value) &&
+        (value < -std::numeric_limits<float>::max() || value > std::numeric_limits<float>::max())) {
+      return {};
+    }
+    return FloatImm(kFloat32, value);
+  } else if (width == 64) {
+    return FloatImm(kFloat64, value);
+  } else {
+    LOG(FATAL) << "Unrecognized float scalar width: " << width;
+    return {};
+  }
+}
+
+}  // namespace support
+}  // namespace tvm
diff --git a/src/support/scalars.h b/src/support/scalars.h
new file mode 100644
index 0000000000000..60b8fc40a8de3
--- /dev/null
+++ b/src/support/scalars.h
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/support/scalars.h
+ * \brief Helpers for converting between scalars in native, text, TIR immediate and NDArray forms.
+ */
+
+#ifndef TVM_SUPPORT_SCALARS_H_
+#define TVM_SUPPORT_SCALARS_H_
+
+#include <string>
+#include <utility>
+
+#include "tvm/ir/expr.h"
+#include "tvm/relay/expr.h"
+#include "tvm/runtime/ndarray.h"
+
+namespace tvm {
+namespace support {
+
+/*! \brief Returns true if a tensor of empty shape and given dtype is considered a Relay scalar. */
+bool IsSimpleScalarDtype(DataType dtype);
+
+/*! \brief Returns true if \p constant_node is a float/int/bool scalar. */
+bool IsSimpleScalar(const relay::ConstantNode* constant_node);
+
+/*! \brief Returns NDArray 'scalar' for given TIR immediate. */
+runtime::NDArray IntImmToNDArray(const IntImm& int_imm);
+runtime::NDArray FloatImmToNDArray(const FloatImm& float_imm);
+runtime::NDArray BoolToNDArray(bool value);
+
+/*! \brief Returns Relay literal text for NDArray 'scalar'. */
+std::string NDArrayScalarToString(const runtime::NDArray& data);
+
+/*! \brief Returns Relay literal text for given TIR immediate. */
+std::string IntImmToString(const IntImm& int_imm);
+std::string FloatImmToString(const FloatImm& float_imm);
+
+/*!
+ * \brief Returns TIR immediate for given value and width. Result will be null if value is
+ * out of range in width. Note however for floating point we don't check if the value is
+ * representable without loss of precision.
+ */
+IntImm ValueToIntImm(int64_t value, int width);
+FloatImm ValueToFloatImm(double value, int width);
+
+}  // namespace support
+}  // namespace tvm
+
+#endif  // TVM_SUPPORT_SCALARS_H_
diff --git a/tests/cpp/support/scalars_test.cc b/tests/cpp/support/scalars_test.cc
new file mode 100644
index 0000000000000..d55f0541fa40b
--- /dev/null
+++ b/tests/cpp/support/scalars_test.cc
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "../../../src/support/scalars.h"
+
+#include <gtest/gtest.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace support {
+namespace {
+
+// Note that functional testing is via test_ir_parser.py and test_ir_text_printer.py.
+// Here we just check handling which is difficult to test via the standard Python API.
+
+TEST(Scalars, IntImmToNDArray_Unsupported) {
+  ASSERT_THROW(IntImmToNDArray(IntImm(DataType::Int(15), 42)), runtime::InternalError);
+}
+
+TEST(Scalars, FloatImmToNDArray_Unsupported) {  // Float(15) is not float16/32/64.
+  ASSERT_THROW(FloatImmToNDArray(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError);
+}
+
+TEST(Scalars, NDArrayScalarToString_Unsupported) {
+  auto ndarray = runtime::NDArray::Empty({}, DataType::Int(8), {DLDeviceType::kDLCPU, 0});
+  ASSERT_THROW(NDArrayScalarToString(ndarray), runtime::InternalError);
+}
+
+TEST(Scalars, IntImmToString_Unsupported) {
+  ASSERT_THROW(IntImmToString(IntImm(DataType::Int(15), 42)), runtime::InternalError);
+}
+
+TEST(Scalars, FloatImmToString_Unsupported) {
+  ASSERT_THROW(FloatImmToString(FloatImm(DataType::Float(15), 42.0)), runtime::InternalError);
+}
+
+TEST(Scalars, ValueToIntImm_Unsupported) {
+  ASSERT_THROW(ValueToIntImm(42, 15), runtime::InternalError);
+}
+
+TEST(Scalars, ValueToFloatImm_Unsupported) {  // Width 15 is not one of 16/32/64.
+  ASSERT_THROW(ValueToFloatImm(42.0, 15), runtime::InternalError);
+}
+
+}  // namespace
+}  // namespace support
+}  // namespace tvm
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
index fdbd3924ffb7f..7a283461e0bda 100644
--- a/tests/python/relay/test_ir_parser.py
+++ b/tests/python/relay/test_ir_parser.py
@@ -15,11 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 import numpy as np
+import pytest
 
 import tvm
 from tvm import relay
 import tvm.relay.testing
-import pytest
 from numpy import isclose
 from typing import Union
 
@@ -172,6 +172,26 @@ def test_int_literal():
     assert get_scalar(parse_text("-05")) == -5
     assert get_scalar(parse_text("9223372036854775807")) == 9223372036854775807
 
+    assert get_scalar(parse_text("-42i")) == -42
+    assert get_scalar(parse_text("-42i16")) == -42
+    assert get_scalar(parse_text("-42i32")) == -42
+    assert get_scalar(parse_text("-42i64")) == -42
+
+    assert_parses_as("-42i16", relay.const(-42, "int16"))
+    assert_parses_as("-42i32", relay.const(-42, "int32"))
+    assert_parses_as("-42i", relay.const(-42, "int32"))
+    assert_parses_as("-42", relay.const(-42, "int32"))
+    assert_parses_as("-42i64", relay.const(-42, "int64"))
+    assert_parses_as("2147483647", relay.const(2147483647, "int32"))
+    assert_parses_as("2147483648", relay.const(2147483648, "int64"))
+
+    with pytest.raises(tvm.error.DiagnosticError):
+        # Unrepresentable
+        parse_text("2147483648i32")
+    with pytest.raises(tvm.error.DiagnosticError):
+        # Unrepresentable
+        parse_text("32768i16")
+
 
 def test_float_literal():
     assert get_scalar(parse_text("1.0f")) == 1.0
@@ -189,11 +209,28 @@ def test_float_literal():
     assert isclose(get_scalar(parse_text("1.0E-1f")), 1.0e-1)
     assert get_scalar(parse_text("1.0E+1f")) == 1.0e1
 
+    assert get_scalar(parse_text("3f16")) == 3.0
+    assert get_scalar(parse_text("3f32")) == 3.0
+
+    assert_parses_as("3f16", relay.const(3.0, "float16"))
+    assert_parses_as("3f32", relay.const(3.0, "float32"))
+    assert_parses_as("3f", relay.const(3.0, "float32"))
+    assert_parses_as("3f64", relay.const(3.0, "float64"))
+
+    with pytest.raises(tvm.error.DiagnosticError):
+        # Unrepresentable
+        parse_text("3.40283e+38f32")
+    with pytest.raises(tvm.error.DiagnosticError):
+        # Unrepresentable
+        parse_text("65505f16")
+
 
 def test_bool_literal():
     assert get_scalar(parse_text("True")) == True
     assert get_scalar(parse_text("False")) == False
 
+    assert_parses_as("True", relay.const(True, "bool"))
+
 
 def test_negative():
     # need to handle parsing non-literal operations
@@ -993,4 +1030,4 @@ def @main(%x: Tensor[(2, 3), float32]) {
 if __name__ == "__main__":
     import sys
 
-    pytest.main(sys.argv)
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
index 54e0e4c7ca441..60f611998649d 100644
--- a/tests/python/relay/test_ir_text_printer.py
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -47,16 +47,28 @@ def show(text):
         print(text)
 
 
-# Commented due to weird memory allocation error
-# def test_large_graph():
-#    x = relay.var("x", shape=(3, 2))
-#    y = relay.var("y")
-#    one = relay.const(10e10, dtype="float32")
-#    z = relay.add(x, one)
-#    for i in range(int(9e5)):
-#        z = relay.add(z, one)
-#    f = relay.Function([x, y], z)
-#    show(astext(f))
+def assert_prints_as(expr, str):
+    assert astext(expr) == SEMVER + str
+
+
+def test_scalars():
+    assert_prints_as(relay.const(42, "int16"), "42i16")
+    assert_prints_as(relay.const(42, "int32"), "42")
+    assert_prints_as(relay.const(42, "int64"), "42i64")
+    assert_prints_as(relay.const(3.0, "float16"), "3f16")
+    assert_prints_as(relay.const(3.0, "float32"), "3f")
+    assert_prints_as(relay.const(3.0, "float64"), "3f64")
+
+
+def test_large_graph():
+    x = relay.var("x", shape=(3, 2))
+    y = relay.var("y")
+    one = relay.const(10e10, dtype="float32")
+    z = relay.add(x, one)
+    for i in range(int(9e4)):
+        z = relay.add(z, one)
+    f = relay.Function([x, y], z)
+    show(astext(f))
 
 
 def test_func():
@@ -295,4 +307,7 @@ def test_slash_in_identifier():
 
 
 if __name__ == "__main__":
-    pytest.main([__file__])
+    import sys
+    import pytest
+
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 16c4faf86c584b22dbeaf304108cee5103ac23c2 Mon Sep 17 00:00:00 2001
From: Altan Haan <3124994+altanh@users.noreply.github.com>
Date: Thu, 19 May 2022 09:15:18 -0700
Subject: [PATCH 34/59] nn.batch_flatten is a reshape op (#11367)

---
 src/relay/op/nn/nn.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 27f295b8b39dc..234cafdca1502 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -540,10 +540,12 @@ Example::
     .add_argument("data", "Tensor", "The input tensor.")
     .set_support_level(2)
     .add_type_rel("BatchFlatten", BatchFlattenRel)
-    .set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                                             const Type& out_type) {
-      return Array<te::Tensor>{topi::nn::flatten(inputs[0])};
-    });
+    .set_attr<FTVMCompute>("FTVMCompute",
+                           [](const Attrs& attrs, const Array<te::Tensor>& inputs,
+                              const Type& out_type) {
+                             return Array<te::Tensor>{topi::nn::flatten(inputs[0])};
+                           })
+    .set_attr<TReshapeOp>("TReshapeOp", true);
 
 // relu
 TVM_REGISTER_GLOBAL("relay.op.nn._make.relu").set_body_typed([](Expr data) {

From 8d0da24f12bdccd8b7d0d953c1280142c8600b4d Mon Sep 17 00:00:00 2001
From: Farshid Salemi Parizi <fparizi@octoml.ai>
Date: Thu, 19 May 2022 09:40:01 -0700
Subject: [PATCH 35/59] [Hexagon] moves conftest.py to tvm.contrib.hexagon so
 outside repos can access the testing fixtures (#11277)

* adding pytest_plugin to python so other repos can access

* import requires_hexagon_toolchain from tvm.contrib.hexagon.pytest_plugin
---
 python/tvm/contrib/hexagon/pytest_plugin.py   | 236 ++++++++++++++++++
 tests/python/contrib/test_hexagon/conftest.py | 212 +---------------
 .../test_hexagon/test_2d_physical_buffers.py  |   2 +-
 .../python/contrib/test_hexagon/test_usmp.py  |   2 +-
 4 files changed, 242 insertions(+), 210 deletions(-)
 create mode 100644 python/tvm/contrib/hexagon/pytest_plugin.py

diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py
new file mode 100644
index 0000000000000..2c62a0a0b5694
--- /dev/null
+++ b/python/tvm/contrib/hexagon/pytest_plugin.py
@@ -0,0 +1,236 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=invalid-name,redefined-outer-name
+""" Hexagon testing fixtures used to deduce testing argument
+    values from testing parameters """
+
+import os
+import random
+from typing import Optional, Union
+
+import pytest
+
+import tvm
+import tvm.rpc.tracker
+from tvm.contrib.hexagon.build import HexagonLauncher, HexagonLauncherRPC
+from tvm.contrib.hexagon.session import Session
+
+HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN"
+TVM_TRACKER_HOST = "TVM_TRACKER_HOST"
+TVM_TRACKER_PORT = "TVM_TRACKER_PORT"
+ANDROID_REMOTE_DIR = "ANDROID_REMOTE_DIR"
+ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER"
+ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET"
+
+
+@tvm.testing.fixture
+def shape_nhwc(batch, in_channel, in_size):
+    return (batch, in_size, in_size, in_channel)
+
+
+def _compose(args, decs):
+    """Helper to apply multiple markers"""
+    if len(args) > 0:
+        func = args[0]
+        for dec in reversed(decs):
+            func = dec(func)
+        return func
+    return decs
+
+
+def requires_hexagon_toolchain(*args):
+    _requires_hexagon_toolchain = [
+        pytest.mark.skipif(
+            os.environ.get(HEXAGON_TOOLCHAIN) is None,
+            reason=f"Missing environment variable {HEXAGON_TOOLCHAIN}.",
+        ),
+    ]
+
+    return _compose(args, _requires_hexagon_toolchain)
+
+
+@tvm.testing.fixture
+def android_serial_number() -> Optional[str]:
+    serial = os.getenv(ANDROID_SERIAL_NUMBER, default="")
+    # Setting ANDROID_SERIAL_NUMBER to an empty string should be
+    # equivalent to having it unset.
+    if not serial.strip():
+        serial = None
+    return serial
+
+
+# NOTE on server ports:
+# These tests use different port numbers for the RPC server (7070 + ...).
+# The reason is that an RPC session cannot be gracefully closed without
+# triggering TIME_WAIT state on the server socket. This prevents another
+# server from binding to the same port until the wait time elapses.
+
+LISTEN_PORT_MIN = 2000  # Well above the privileged ports (1024 or lower)
+LISTEN_PORT_MAX = 9000  # Below the search range end (port_end=9199) of RPC server
+PREVIOUS_PORT = None
+
+
+def get_free_port() -> int:
+    """Return the next port that is available to listen on"""
+    global PREVIOUS_PORT
+    if PREVIOUS_PORT is None:
+        port = random.randint(LISTEN_PORT_MIN, LISTEN_PORT_MAX)
+    else:
+        port = PREVIOUS_PORT + 1
+
+    while tvm.contrib.hexagon.build._is_port_in_use(port):
+        port = port + 1 if port < LISTEN_PORT_MAX else LISTEN_PORT_MIN
+
+    PREVIOUS_PORT = port
+    return port
+
+
+@pytest.fixture(scope="session")
+def _tracker_info() -> Union[str, int]:
+    env_tracker_host = os.getenv(TVM_TRACKER_HOST, default="")
+    env_tracker_port = os.getenv(TVM_TRACKER_PORT, default="")
+
+    if env_tracker_host or env_tracker_port:
+        # A tracker is already running, and we should connect to it
+        # when running tests.
+        assert env_tracker_host, "TVM_TRACKER_PORT is defined, but TVM_TRACKER_HOST is not"
+        assert env_tracker_port, "TVM_TRACKER_HOST is defined, but TVM_TRACKER_PORT is not"
+        env_tracker_port = int(env_tracker_port)
+
+        try:
+            tvm.rpc.connect_tracker(env_tracker_host, env_tracker_port)
+        except RuntimeError as exc:
+            message = (
+                "Could not connect to external tracker "
+                "specified by $TVM_TRACKER_HOST and $TVM_TRACKER_PORT "
+                f"({env_tracker_host}:{env_tracker_port})"
+            )
+            raise RuntimeError(message) from exc
+
+        yield (env_tracker_host, env_tracker_port)
+
+    else:
+        # No tracker is provided to the tests, so we should start one
+        # for the tests to use.
+        tracker = tvm.rpc.tracker.Tracker("127.0.0.1", get_free_port())
+        try:
+            yield (tracker.host, tracker.port)
+        finally:
+            tracker.terminate()
+
+
+@pytest.fixture(scope="session")
+def tvm_tracker_host(_tracker_info) -> str:
+    host, _ = _tracker_info
+    return host
+
+
+@pytest.fixture(scope="session")
+def tvm_tracker_port(_tracker_info) -> int:
+    _, port = _tracker_info
+    return port
+
+
+@tvm.testing.fixture
+def rpc_server_port() -> int:
+    return get_free_port()
+
+
+@tvm.testing.fixture
+def adb_server_socket() -> str:
+    return os.getenv(ADB_SERVER_SOCKET, default="tcp:5037")
+
+
+@tvm.testing.fixture
+def hexagon_launcher(
+    request, android_serial_number, rpc_server_port, adb_server_socket
+) -> HexagonLauncherRPC:
+    """Initializes and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined"""
+    if android_serial_number is None:
+        yield None
+    else:
+        # Requesting these fixtures sets up a local tracker, if one
+        # hasn't been provided to us.  Delaying the evaluation of
+        # these fixtures avoids starting a tracker unless necessary.
+        tvm_tracker_host = request.getfixturevalue("tvm_tracker_host")
+        tvm_tracker_port = request.getfixturevalue("tvm_tracker_port")
+
+        rpc_info = {
+            "rpc_tracker_host": tvm_tracker_host,
+            "rpc_tracker_port": tvm_tracker_port,
+            "rpc_server_port": rpc_server_port,
+            "adb_server_socket": adb_server_socket,
+        }
+        launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+        launcher.start_server()
+        try:
+            yield launcher
+        finally:
+            launcher.stop_server()
+
+
+@tvm.testing.fixture
+def hexagon_session(hexagon_launcher) -> Session:
+    if hexagon_launcher is None:
+        yield None
+    else:
+        with hexagon_launcher.start_session() as session:
+            yield session
+
+
+# If the execution aborts while an RPC server is running, the python
+# code that is supposed to shut it down will never execute. This will
+# keep pytest from terminating (indefinitely), so add a cleanup
+# fixture to terminate any still-running servers.
+@pytest.fixture(scope="session", autouse=True)
+def terminate_rpc_servers():
+    # Since this is a fixture that runs regardless of whether the
+    # execution happens on simulator or on target, make sure the
+    # yield happens every time.
+    serial = os.environ.get(ANDROID_SERIAL_NUMBER)
+    yield []
+    if serial == "simulator":
+        os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill")
+
+
+aot_host_target = tvm.testing.parameter(
+    "c",
+    "llvm -keys=hexagon -link-params=0 "
+    "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp "
+    "-mcpu=hexagonv68 -mtriple=hexagon",
+)
+
+
+@tvm.testing.fixture
+def aot_target(aot_host_target):
+    if aot_host_target == "c":  # C host: build for the Hexagon v68 target.
+        yield tvm.target.hexagon("v68")
+    elif aot_host_target.startswith("llvm"):
+        yield aot_host_target  # An llvm host target string is used unchanged.
+    else:
+        assert False, f"Incorrect AoT host target: {aot_host_target}. Options are [c, llvm]."
+
+
+def pytest_addoption(parser):
+    parser.addoption("--gtest_args", action="store", default="")
+
+
+def pytest_generate_tests(metafunc):
+    option_value = metafunc.config.option.gtest_args
+    if "gtest_args" in metafunc.fixturenames and option_value is not None:
+        metafunc.parametrize("gtest_args", [option_value])
diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py
index f76181e06d0eb..3b057384df372 100644
--- a/tests/python/contrib/test_hexagon/conftest.py
+++ b/tests/python/contrib/test_hexagon/conftest.py
@@ -18,216 +18,12 @@
 """ Hexagon testing fixtures used to deduce testing argument
     values from testing parameters """
 
-import os
-import random
-import socket
-from typing import Optional, Union
 
 import pytest
 
 import tvm
-import tvm.rpc.tracker
-from tvm.contrib.hexagon.build import HexagonLauncher, HexagonLauncherRPC
-from tvm.contrib.hexagon.session import Session
+import tvm.testing
 
-HEXAGON_TOOLCHAIN = "HEXAGON_TOOLCHAIN"
-TVM_TRACKER_HOST = "TVM_TRACKER_HOST"
-TVM_TRACKER_PORT = "TVM_TRACKER_PORT"
-ANDROID_REMOTE_DIR = "ANDROID_REMOTE_DIR"
-ANDROID_SERIAL_NUMBER = "ANDROID_SERIAL_NUMBER"
-ADB_SERVER_SOCKET = "ADB_SERVER_SOCKET"
-
-
-@tvm.testing.fixture
-def shape_nhwc(batch, in_channel, in_size):
-    return (batch, in_size, in_size, in_channel)
-
-
-def _compose(args, decs):
-    """Helper to apply multiple markers"""
-    if len(args) > 0:
-        f = args[0]
-        for d in reversed(decs):
-            f = d(f)
-        return f
-    return decs
-
-
-def requires_hexagon_toolchain(*args):
-    _requires_hexagon_toolchain = [
-        pytest.mark.skipif(
-            os.environ.get(HEXAGON_TOOLCHAIN) == None,
-            reason=f"Missing environment variable {HEXAGON_TOOLCHAIN}.",
-        ),
-    ]
-
-    return _compose(args, _requires_hexagon_toolchain)
-
-
-@tvm.testing.fixture
-def android_serial_number() -> Optional[str]:
-    serial = os.getenv(ANDROID_SERIAL_NUMBER, default="")
-    # Setting ANDROID_SERIAL_NUMBER to an empty string should be
-    # equivalent to having it unset.
-    if not serial.strip():
-        serial = None
-    return serial
-
-
-# NOTE on server ports:
-# These tests use different port numbers for the RPC server (7070 + ...).
-# The reason is that an RPC session cannot be gracefully closed without
-# triggering TIME_WAIT state on the server socket. This prevents another
-# server to bind to the same port until the wait time elapses.
-
-listen_port_min = 2000  # Well above the privileged ports (1024 or lower)
-listen_port_max = 9000  # Below the search range end (port_end=9199) of RPC server
-previous_port = None
-
-
-def get_free_port() -> int:
-
-    global previous_port
-    if previous_port is None:
-        port = random.randint(listen_port_min, listen_port_max)
-    else:
-        port = previous_port + 1
-
-    while tvm.contrib.hexagon.build._is_port_in_use(port):
-        port = port + 1 if port < listen_port_max else listen_port_min
-
-    previous_port = port
-    return port
-
-
-@pytest.fixture(scope="session")
-def _tracker_info() -> Union[str, int]:
-    env_tracker_host = os.getenv(TVM_TRACKER_HOST, default="")
-    env_tracker_port = os.getenv(TVM_TRACKER_PORT, default="")
-
-    if env_tracker_host or env_tracker_port:
-        # A tracker is already running, and we should connect to it
-        # when running tests.
-        assert env_tracker_host, "TVM_TRACKER_PORT is defined, but TVM_TRACKER_HOST is not"
-        assert env_tracker_port, "TVM_TRACKER_HOST is defined, but TVM_TRACKER_PORT is not"
-        env_tracker_port = int(env_tracker_port)
-
-        try:
-            tvm.rpc.connect_tracker(env_tracker_host, env_tracker_port)
-        except RuntimeError as exc:
-            message = (
-                "Could not connect to external tracker "
-                "specified by $TVM_TRACKER_HOST and $TVM_TRACKER_PORT "
-                f"({env_tracker_host}:{env_tracker_port})"
-            )
-            raise RuntimeError(message) from exc
-
-        yield (env_tracker_host, env_tracker_port)
-
-    else:
-        # No tracker is provided to the tests, so we should start one
-        # for the tests to use.
-        tracker = tvm.rpc.tracker.Tracker("127.0.0.1", get_free_port())
-        try:
-            yield (tracker.host, tracker.port)
-        finally:
-            tracker.terminate()
-
-
-@pytest.fixture(scope="session")
-def tvm_tracker_host(_tracker_info) -> str:
-    host, port = _tracker_info
-    return host
-
-
-@pytest.fixture(scope="session")
-def tvm_tracker_port(_tracker_info) -> int:
-    host, port = _tracker_info
-    return port
-
-
-@tvm.testing.fixture
-def rpc_server_port() -> int:
-    return get_free_port()
-
-
-@tvm.testing.fixture
-def adb_server_socket() -> str:
-    return os.getenv(ADB_SERVER_SOCKET, default="tcp:5037")
-
-
-@tvm.testing.fixture
-def hexagon_launcher(
-    request, android_serial_number, rpc_server_port, adb_server_socket
-) -> HexagonLauncherRPC:
-    if android_serial_number is None:
-        yield None
-    else:
-        # Requesting these fixtures sets up a local tracker, if one
-        # hasn't been provided to us.  Delaying the evaluation of
-        # these fixtures avoids starting a tracker unless necessary.
-        tvm_tracker_host = request.getfixturevalue("tvm_tracker_host")
-        tvm_tracker_port = request.getfixturevalue("tvm_tracker_port")
-
-        rpc_info = {
-            "rpc_tracker_host": tvm_tracker_host,
-            "rpc_tracker_port": tvm_tracker_port,
-            "rpc_server_port": rpc_server_port,
-            "adb_server_socket": adb_server_socket,
-        }
-        launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
-        launcher.start_server()
-        try:
-            yield launcher
-        finally:
-            launcher.stop_server()
-
-
-@tvm.testing.fixture
-def hexagon_session(hexagon_launcher) -> Session:
-    if hexagon_launcher is None:
-        yield None
-    else:
-        with hexagon_launcher.start_session() as session:
-            yield session
-
-
-# If the execution aborts while an RPC server is running, the python
-# code that is supposed to shut it dowm will never execute. This will
-# keep pytest from terminating (indefinitely), so add a cleanup
-# fixture to terminate any still-running servers.
-@pytest.fixture(scope="session", autouse=True)
-def terminate_rpc_servers():
-    # Since this is a fixture that runs regardless of whether the
-    # execution happens on simulator or on target, make sure the
-    # yield happens every time.
-    serial = os.environ.get(ANDROID_SERIAL_NUMBER)
-    yield []
-    if serial == "simulator":
-        os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill")
-
-
-aot_host_target = tvm.testing.parameter(
-    "c",
-    "llvm -keys=hexagon -link-params=0 -mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv68 -mtriple=hexagon",
-)
-
-
-@tvm.testing.fixture
-def aot_target(aot_host_target):
-    if aot_host_target == "c":
-        yield tvm.target.hexagon("v68")
-    elif aot_host_target.startswith("llvm"):
-        yield aot_host_target
-    else:
-        assert False, "Incorrect AoT host target: {aot_host_target}. Options are [c, llvm]."
-
-
-def pytest_addoption(parser):
-    parser.addoption("--gtest_args", action="store", default="")
-
-
-def pytest_generate_tests(metafunc):
-    option_value = metafunc.config.option.gtest_args
-    if "gtest_args" in metafunc.fixturenames and option_value is not None:
-        metafunc.parametrize("gtest_args", [option_value])
+pytest_plugins = [
+    "tvm.contrib.hexagon.pytest_plugin",
+]
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
index 78e1eb11ad9fd..787d71fa17132 100644
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -29,7 +29,7 @@
 from tvm.tir.stmt_functor import post_order_visit
 from tvm.contrib.hexagon.build import HexagonLauncher
 
-from .conftest import requires_hexagon_toolchain
+from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain
 from .infrastructure import allocate_hexagon_array
 
 # Needed to register the link_shared packedfunc.
diff --git a/tests/python/contrib/test_hexagon/test_usmp.py b/tests/python/contrib/test_hexagon/test_usmp.py
index 116ecb4154ddb..03badfb655d96 100644
--- a/tests/python/contrib/test_hexagon/test_usmp.py
+++ b/tests/python/contrib/test_hexagon/test_usmp.py
@@ -26,7 +26,7 @@
 from tvm.contrib.hexagon.session import Session
 from tvm.testing.usmp import is_tvm_backendallocworkspace_calls
 
-from .conftest import requires_hexagon_toolchain
+from tvm.contrib.hexagon.pytest_plugin import requires_hexagon_toolchain
 
 usmp_enabled = tvm.testing.parameter(False, True)
 

From cd269101b7c508f5432ad4aee3c1ff8d07a89142 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 19 May 2022 13:52:18 -0700
Subject: [PATCH 36/59] [ci] Use S3 for artifacts (#11349)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile            | 604 +++++++++++++++++++++++++++++++++++------
 jenkins/Jenkinsfile.j2 | 125 ++++-----
 jenkins/macros.j2      |  32 +++
 3 files changed, 598 insertions(+), 163 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 424f97494d767..024b920ac676e 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-17T17:26:21.660243
+// Generated at 2022-05-19T11:41:58.421857
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -482,53 +482,9 @@ def make(docker_type, path, make_flag) {
   }
 }
 
-// Specifications to Jenkins "stash" command for use with various pack_ and unpack_ functions.
-tvm_runtime = 'build/libtvm_runtime.so, build/config.cmake'  // use libtvm_runtime.so.
-tvm_lib = 'build/libtvm.so, ' + tvm_runtime  // use libtvm.so to run the full compiler.
-// LLVM upstream lib
-tvm_multilib = 'build/libtvm.so, ' +
-               'build/libvta_fsim.so, ' +
-               tvm_runtime
-
-tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
-                    tvm_multilib
-
-microtvm_tar_gz = 'build/microtvm_template_projects.tar.gz'
-
-// pack libraries for later use
-def pack_lib(name, libs) {
-  sh (script: """
-     echo "Packing ${libs} into ${name}"
-     echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
-     """, label: 'Stash libraries and show md5')
-  stash includes: libs, name: name
-}
+// Filenames for stashing between build and test steps
+s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
-// unpack libraries saved before
-def unpack_lib(name, libs) {
-  unstash name
-  sh (script: """
-     echo "Unpacked ${libs} from ${name}"
-     echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
-     """, label: 'Unstash libraries and show md5')
-}
-
-// compress microtvm template projects and pack the tar.
-def pack_microtvm_template_projects(name) {
-  sh(
-    script: 'cd build && tar -czvf microtvm_template_projects.tar.gz microtvm_template_projects/',
-    label: 'Compress microtvm_template_projects'
-  )
-  pack_lib(name + '-microtvm-libs', microtvm_tar_gz)
-}
-
-def unpack_microtvm_template_projects(name) {
-  unpack_lib(name + '-microtvm-libs', microtvm_tar_gz)
-  sh(
-    script: 'cd build && tar -xzvf microtvm_template_projects.tar.gz',
-    label: 'Unpack microtvm_template_projects'
-  )
-}
 
 def ci_setup(image) {
   sh (
@@ -565,24 +521,63 @@ def cpp_unittest(image) {
   )
 }
 
+
+def add_microtvm_permissions() {
+  sh(
+    script: 'find build/microtvm_template_projects -type f | xargs chmod +x',
+    label: 'Add execute permissions for microTVM files',
+  )
+}
+
+
 def build() {
 stage('Build') {
   environment {
     SKIP_SLOW_TESTS = "${skip_slow_tests}"
   }
-  parallel 'BUILD: GPU': {
+  parallel(
+    'BUILD: GPU': {
     if (!skip_ci) {
       node('CPU-SMALL') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") {
           init_git()
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
           make("${ci_gpu} --no-gpu", 'build', '-j2')
-          pack_lib('gpu', tvm_multilib)
-          pack_microtvm_template_projects('gpu')
+          sh(
+            script: """
+              set -eux
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu/build/config.cmake
+              aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/gpu/build/microtvm_template_projects --recursive
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
+
           // compiler test
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
           make("${ci_gpu} --no-gpu", 'build2', '-j2')
-          pack_lib('gpu2', tvm_multilib)
+          sh(
+            script: """
+              set -eux
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/gpu2/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/gpu2/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/gpu2/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
         }
       }
     }
@@ -597,7 +592,23 @@ stage('Build') {
             label: 'Create CPU cmake config',
           )
           make(ci_cpu, 'build', '-j2')
-          pack_lib('cpu', tvm_multilib_tsim)
+          sh(
+            script: """
+              set -eux
+              md5sum build/libvta_tsim.so
+              aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/cpu/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/cpu/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/cpu/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
           timeout(time: max_time, unit: 'MINUTES') {
             ci_setup(ci_cpu)
             // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
@@ -644,7 +655,23 @@ stage('Build') {
             label: 'Create i386 cmake config',
           )
           make(ci_i386, 'build', '-j2')
-          pack_lib('i386', tvm_multilib_tsim)
+          sh(
+            script: """
+              set -eux
+              md5sum build/libvta_tsim.so
+              aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/i386/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/i386/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/i386/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/i386/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
         }
       }
     } else {
@@ -661,7 +688,21 @@ stage('Build') {
             label: 'Create ARM cmake config',
           )
           make(ci_arm, 'build', '-j4')
-          pack_lib('arm', tvm_multilib)
+          sh(
+            script: """
+              set -eux
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress build/libvta_fsim.so s3://${s3_prefix}/arm/build/libvta_fsim.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/arm/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/arm/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
         }
       }
      } else {
@@ -678,8 +719,20 @@ stage('Build') {
             label: 'Create QEMU cmake config',
           )
           make(ci_qemu, 'build', '-j2')
-          pack_lib('qemu', tvm_lib)
-          pack_microtvm_template_projects('qemu')
+          sh(
+            script: """
+              set -eux
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/qemu/build/libtvm.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/qemu/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/qemu/build/config.cmake
+              aws s3 cp --no-progress build/microtvm_template_projects s3://${s3_prefix}/qemu/build/microtvm_template_projects --recursive
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
         }
       }
      } else {
@@ -696,13 +749,26 @@ stage('Build') {
             label: 'Create Hexagon cmake config',
           )
           make(ci_hexagon, 'build', '-j2')
-          pack_lib('hexagon', tvm_lib)
+          sh(
+            script: """
+              set -eux
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress build/libtvm_runtime.so s3://${s3_prefix}/hexagon/build/libtvm_runtime.so
+              md5sum build/config.cmake
+              aws s3 cp --no-progress build/config.cmake s3://${s3_prefix}/hexagon/build/config.cmake
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
         }
       }
      } else {
       Utils.markStageSkippedForConditional('BUILD: Hexagon')
     }
-  }
+  },
+  )
 }
 }
 
@@ -726,10 +792,38 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('gpu2', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 cpp_unittest(ci_gpu)
 
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 cpp_unittest(ci_gpu)
                 sh (
@@ -762,7 +856,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
@@ -798,7 +906,23 @@ stage('Test') {
                 'PLATFORM=cpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('cpu', tvm_multilib_tsim)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
+                          md5sum build/libvta_tsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_cpu)
                 sh (
                   script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
@@ -826,7 +950,23 @@ stage('Test') {
                 'PLATFORM=cpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('cpu', tvm_multilib_tsim)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
+                          md5sum build/libvta_tsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_cpu)
                 sh (
                   script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
@@ -851,7 +991,23 @@ stage('Test') {
             try {
               init_git()
               withEnv(['PLATFORM=cpu'], {
-                unpack_lib('cpu', tvm_multilib_tsim)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so
+                          md5sum build/libvta_tsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_cpu)
                 cpp_unittest(ci_cpu)
                 python_unittest(ci_cpu)
@@ -882,7 +1038,21 @@ stage('Test') {
                 'PLATFORM=i386',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('i386', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_i386)
                 cpp_unittest(ci_i386)
                 python_unittest(ci_i386)
@@ -913,7 +1083,21 @@ stage('Test') {
                 'PLATFORM=i386',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('i386', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_i386)
                 python_unittest(ci_i386)
                 sh (
@@ -943,7 +1127,21 @@ stage('Test') {
                 'PLATFORM=i386',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=2'], {
-                unpack_lib('i386', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_i386)
                 python_unittest(ci_i386)
                 sh (
@@ -973,7 +1171,19 @@ stage('Test') {
                 'PLATFORM=hexagon',
                 'TVM_NUM_SHARDS=4',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('hexagon', tvm_lib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_hexagon)
                 cpp_unittest(ci_hexagon)
                 sh (
@@ -1006,7 +1216,19 @@ stage('Test') {
                 'PLATFORM=hexagon',
                 'TVM_NUM_SHARDS=4',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('hexagon', tvm_lib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_hexagon)
                 sh (
                   script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
@@ -1038,7 +1260,19 @@ stage('Test') {
                 'PLATFORM=hexagon',
                 'TVM_NUM_SHARDS=4',
                 'TVM_SHARD_INDEX=2'], {
-                unpack_lib('hexagon', tvm_lib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_hexagon)
                 sh (
                   script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
@@ -1070,7 +1304,19 @@ stage('Test') {
                 'PLATFORM=hexagon',
                 'TVM_NUM_SHARDS=4',
                 'TVM_SHARD_INDEX=3'], {
-                unpack_lib('hexagon', tvm_lib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_hexagon)
                 sh (
                   script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
@@ -1099,8 +1345,21 @@ stage('Test') {
             try {
               init_git()
               withEnv(['PLATFORM=qemu'], {
-                unpack_lib('qemu', tvm_lib)
-                unpack_microtvm_template_projects('qemu')
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                          aws s3 cp --no-progress s3://${s3_prefix}/qemu/build/microtvm_template_projects build/microtvm_template_projects --recursive
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
+                add_microtvm_permissions()
                 ci_setup(ci_qemu)
                 cpp_unittest(ci_qemu)
                 sh (
@@ -1130,7 +1389,21 @@ stage('Test') {
             try {
               init_git()
               withEnv(['PLATFORM=arm'], {
-                unpack_lib('arm', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_arm)
                 cpp_unittest(ci_arm)
                 sh (
@@ -1163,7 +1436,21 @@ stage('Test') {
                 'PLATFORM=arm',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('arm', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_arm)
                 python_unittest(ci_arm)
                 sh (
@@ -1192,7 +1479,21 @@ stage('Test') {
                 'PLATFORM=arm',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('arm', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_arm)
                 python_unittest(ci_arm)
                 sh (
@@ -1221,7 +1522,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
@@ -1249,7 +1564,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=2',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
@@ -1277,7 +1606,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=0'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
@@ -1305,7 +1648,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=1'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
@@ -1333,7 +1690,21 @@ stage('Test') {
                 'PLATFORM=gpu',
                 'TVM_NUM_SHARDS=3',
                 'TVM_SHARD_INDEX=2'], {
-                unpack_lib('gpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_gpu)
                 sh (
                   script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
@@ -1358,7 +1729,21 @@ stage('Test') {
             try {
               init_git()
               withEnv(['PLATFORM=cpu'], {
-                unpack_lib('cpu', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_cpu)
                 sh (
                   script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -1383,7 +1768,21 @@ stage('Test') {
             try {
               init_git()
               withEnv(['PLATFORM=arm'], {
-                unpack_lib('arm', tvm_multilib)
+                sh(
+                        script: """
+                          set -eux
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so
+                          md5sum build/libtvm.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so
+                          md5sum build/libvta_fsim.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so
+                          md5sum build/libtvm_runtime.so
+                          aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake
+                          md5sum build/config.cmake
+                        """,
+                        label: 'Download artifacts from S3',
+                      )
+
                 ci_setup(ci_arm)
                 sh (
                   script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -1405,8 +1804,23 @@ stage('Test') {
       node('GPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") {
           init_git()
-          unpack_lib('gpu', tvm_multilib)
-          unpack_microtvm_template_projects('gpu')
+          sh(
+            script: """
+              set -eux
+              aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so
+              md5sum build/libtvm.so
+              aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so
+              md5sum build/libvta_fsim.so
+              aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so
+              md5sum build/libtvm_runtime.so
+              aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake
+              md5sum build/config.cmake
+              aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/microtvm_template_projects build/microtvm_template_projects --recursive
+            """,
+            label: 'Download artifacts from S3',
+          )
+
+          add_microtvm_permissions()
           timeout(time: 180, unit: 'MINUTES') {
             ci_setup(ci_gpu)
             sh (
@@ -1414,7 +1828,15 @@ stage('Test') {
               label: 'Build docs',
             )
           }
-          pack_lib('docs', 'docs.tgz')
+          sh(
+            script: """
+              set -eux
+              md5sum docs.tgz
+              aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz
+            """,
+            label: 'Upload artifacts to S3',
+          )
+
           archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true)
         }
       }
@@ -1489,7 +1911,15 @@ stage('Deploy') {
   if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
     node('CPU') {
       ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") {
-        unpack_lib('docs', 'docs.tgz')
+        sh(
+            script: """
+              set -eux
+              aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz
+              md5sum docs.tgz
+            """,
+            label: 'Download artifacts from S3',
+          )
+
         deploy_docs()
       }
     }
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index f250ff12feed0..8742d07244857 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -399,53 +399,14 @@ def make(docker_type, path, make_flag) {
   }
 }
 
-// Specifications to Jenkins "stash" command for use with various pack_ and unpack_ functions.
-tvm_runtime = 'build/libtvm_runtime.so, build/config.cmake'  // use libtvm_runtime.so.
-tvm_lib = 'build/libtvm.so, ' + tvm_runtime  // use libtvm.so to run the full compiler.
-// LLVM upstream lib
-tvm_multilib = 'build/libtvm.so, ' +
-               'build/libvta_fsim.so, ' +
-               tvm_runtime
-
-tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
-                    tvm_multilib
-
-microtvm_tar_gz = 'build/microtvm_template_projects.tar.gz'
-
-// pack libraries for later use
-def pack_lib(name, libs) {
-  sh (script: """
-     echo "Packing ${libs} into ${name}"
-     echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
-     """, label: 'Stash libraries and show md5')
-  stash includes: libs, name: name
-}
-
-// unpack libraries saved before
-def unpack_lib(name, libs) {
-  unstash name
-  sh (script: """
-     echo "Unpacked ${libs} from ${name}"
-     echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
-     """, label: 'Unstash libraries and show md5')
-}
+// Filenames for stashing between build and test steps
+{% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %}
+{% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %}
+{% set tvm_multilib = ['build/libtvm.so', 'build/libvta_fsim.so'] + tvm_runtime %}
+{% set tvm_multilib_tsim = ['build/libvta_tsim.so'] + tvm_multilib %}
+{% set microtvm_template_projects = ['build/microtvm_template_projects',] %}
+s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
-// compress microtvm template projects and pack the tar.
-def pack_microtvm_template_projects(name) {
-  sh(
-    script: 'cd build && tar -czvf microtvm_template_projects.tar.gz microtvm_template_projects/',
-    label: 'Compress microtvm_template_projects'
-  )
-  pack_lib(name + '-microtvm-libs', microtvm_tar_gz)
-}
-
-def unpack_microtvm_template_projects(name) {
-  unpack_lib(name + '-microtvm-libs', microtvm_tar_gz)
-  sh(
-    script: 'cd build && tar -xzvf microtvm_template_projects.tar.gz',
-    label: 'Unpack microtvm_template_projects'
-  )
-}
 
 def ci_setup(image) {
   sh (
@@ -482,24 +443,36 @@ def cpp_unittest(image) {
   )
 }
 
+
+def add_microtvm_permissions() {
+  {% for folder in microtvm_template_projects %}
+  sh(
+    script: 'find {{ folder }} -type f | xargs chmod +x',
+    label: 'Add execute permissions for microTVM files',
+  )
+  {% endfor %}
+}
+
+
 def build() {
 stage('Build') {
   environment {
     SKIP_SLOW_TESTS = "${skip_slow_tests}"
   }
-  parallel 'BUILD: GPU': {
+  parallel(
+    'BUILD: GPU': {
     if (!skip_ci) {
       node('CPU-SMALL') {
         ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
           init_git()
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
           make("${ci_gpu} --no-gpu", 'build', '-j2')
-          pack_lib('gpu', tvm_multilib)
-          pack_microtvm_template_projects('gpu')
+          {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+
           // compiler test
           sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
           make("${ci_gpu} --no-gpu", 'build2', '-j2')
-          pack_lib('gpu2', tvm_multilib)
+          {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
         }
       }
     }
@@ -514,7 +487,7 @@ stage('Build') {
             label: 'Create CPU cmake config',
           )
           make(ci_cpu, 'build', '-j2')
-          pack_lib('cpu', tvm_multilib_tsim)
+          {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
           timeout(time: max_time, unit: 'MINUTES') {
             ci_setup(ci_cpu)
             // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
@@ -561,7 +534,7 @@ stage('Build') {
             label: 'Create i386 cmake config',
           )
           make(ci_i386, 'build', '-j2')
-          pack_lib('i386', tvm_multilib_tsim)
+          {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
         }
       }
     } else {
@@ -578,7 +551,7 @@ stage('Build') {
             label: 'Create ARM cmake config',
           )
           make(ci_arm, 'build', '-j4')
-          pack_lib('arm', tvm_multilib)
+          {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
         }
       }
      } else {
@@ -595,8 +568,7 @@ stage('Build') {
             label: 'Create QEMU cmake config',
           )
           make(ci_qemu, 'build', '-j2')
-          pack_lib('qemu', tvm_lib)
-          pack_microtvm_template_projects('qemu')
+          {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
         }
       }
      } else {
@@ -613,13 +585,14 @@ stage('Build') {
             label: 'Create Hexagon cmake config',
           )
           make(ci_hexagon, 'build', '-j2')
-          pack_lib('hexagon', tvm_lib)
+          {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }}
         }
       }
      } else {
       Utils.markStageSkippedForConditional('BUILD: Hexagon')
     }
-  }
+  },
+  )
 }
 }
 
@@ -640,14 +613,14 @@ stage('Test') {
     platform="gpu",
   ) %}
     {% if shard_index == 1 %}
-    unpack_lib('gpu2', tvm_multilib)
+    {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }}
     cpp_unittest(ci_gpu)
 
-    unpack_lib('gpu', tvm_multilib)
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
     ci_setup(ci_gpu)
     cpp_unittest(ci_gpu)
     {% else %}
-    unpack_lib('gpu', tvm_multilib)
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
     ci_setup(ci_gpu)
     {% endif %}
     {% if shard_index == 2 or num_shards < 2 %}
@@ -672,7 +645,7 @@ stage('Test') {
       ws="tvm/integration-python-cpu",
       platform="cpu",
     ) %}
-    unpack_lib('cpu', tvm_multilib_tsim)
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
     ci_setup(ci_cpu)
     sh (
       script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
@@ -685,7 +658,7 @@ stage('Test') {
     ws="tvm/ut-python-cpu",
     platform="cpu",
   ) %}
-    unpack_lib('cpu', tvm_multilib_tsim)
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
     ci_setup(ci_cpu)
     cpp_unittest(ci_cpu)
     python_unittest(ci_cpu)
@@ -702,7 +675,7 @@ stage('Test') {
     ws="tvm/integration-python-i386",
     platform="i386",
   ) %}
-    unpack_lib('i386', tvm_multilib)
+    {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }}
     ci_setup(ci_i386)
     {% if shard_index == 1 %}
     cpp_unittest(ci_i386)
@@ -721,7 +694,7 @@ stage('Test') {
     platform="hexagon",
     num_shards=4,
   ) %}
-    unpack_lib('hexagon', tvm_lib)
+    {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }}
     ci_setup(ci_hexagon)
     {% if shard_index == 1 %}
     cpp_unittest(ci_hexagon)
@@ -741,8 +714,8 @@ stage('Test') {
     ws="tvm/test-qemu",
     platform="qemu",
   ) %}
-    unpack_lib('qemu', tvm_lib)
-    unpack_microtvm_template_projects('qemu')
+    {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
+    add_microtvm_permissions()
     ci_setup(ci_qemu)
     cpp_unittest(ci_qemu)
     sh (
@@ -760,7 +733,7 @@ stage('Test') {
     ws="tvm/ut-python-arm",
     platform="arm",
 ) %}
-    unpack_lib('arm', tvm_multilib)
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
     ci_setup(ci_arm)
     cpp_unittest(ci_arm)
     sh (
@@ -778,7 +751,7 @@ stage('Test') {
     node="ARM", ws="tvm/ut-python-arm",
     platform="arm",
   ) %}
-    unpack_lib('arm', tvm_multilib)
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
     ci_setup(ci_arm)
     python_unittest(ci_arm)
     sh (
@@ -793,7 +766,7 @@ stage('Test') {
     ws="tvm/topi-python-gpu",
     platform="gpu",
   ) %}
-    unpack_lib('gpu', tvm_multilib)
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
     ci_setup(ci_gpu)
     sh (
       script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
@@ -806,7 +779,7 @@ stage('Test') {
     ws="tvm/frontend-python-gpu",
     platform="gpu",
   ) %}
-    unpack_lib('gpu', tvm_multilib)
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
     ci_setup(ci_gpu)
     sh (
       script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
@@ -819,7 +792,7 @@ stage('Test') {
     ws="tvm/frontend-python-cpu",
     platform="cpu",
 ) %}
-    unpack_lib('cpu', tvm_multilib)
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }}
     ci_setup(ci_cpu)
     sh (
       script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -832,7 +805,7 @@ stage('Test') {
     ws="tvm/frontend-python-arm",
     platform="arm",
 ) %}
-    unpack_lib('arm', tvm_multilib)
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
     ci_setup(ci_arm)
     sh (
       script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
@@ -844,8 +817,8 @@ stage('Test') {
       node('GPU') {
         ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
           init_git()
-          unpack_lib('gpu', tvm_multilib)
-          unpack_microtvm_template_projects('gpu')
+          {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+          add_microtvm_permissions()
           timeout(time: 180, unit: 'MINUTES') {
             ci_setup(ci_gpu)
             sh (
@@ -853,7 +826,7 @@ stage('Test') {
               label: 'Build docs',
             )
           }
-          pack_lib('docs', 'docs.tgz')
+          {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
           archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true)
         }
       }
@@ -928,7 +901,7 @@ stage('Deploy') {
   if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
     node('CPU') {
       ws({{ m.per_exec_ws('tvm/deploy-docs') }}) {
-        unpack_lib('docs', 'docs.tgz')
+        {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }}
         deploy_docs()
       }
     }
diff --git a/jenkins/macros.j2 b/jenkins/macros.j2
index de33a203f603b..2ce005a128efb 100644
--- a/jenkins/macros.j2
+++ b/jenkins/macros.j2
@@ -89,3 +89,35 @@
     }
   },
 {% endmacro %}
+
+{% macro upload_artifacts(tag, filenames, folders=None) %}
+sh(
+            script: """
+              set -eux
+              {% for filename in filenames %}
+              md5sum {{ filename }}
+              aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }}
+              {% endfor %}
+              {% for folder in (folders or []) %}
+              aws s3 cp --no-progress {{ folder }} s3://${s3_prefix}/{{ tag }}/{{ folder }} --recursive
+              {% endfor %}
+            """,
+            label: 'Upload artifacts to S3',
+          )
+{% endmacro %}
+
+{% macro download_artifacts(tag, filenames, folders=None) %}
+sh(
+            script: """
+              set -eux
+              {% for filename in filenames %}
+              aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }}
+              md5sum {{ filename }}
+              {% endfor %}
+              {% for folder in (folders or []) %}
+              aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ folder }} {{ folder }} --recursive
+              {% endfor %}
+            """,
+            label: 'Download artifacts from S3',
+          )
+{% endmacro %}

From 5e29dddd02193a440c18a1d98fef9023cb008788 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar <mhessar@octoml.ai>
Date: Thu, 19 May 2022 16:09:51 -0700
Subject: [PATCH 37/59] [microTVM][ARM] Add Relay tests for conv2d registered
 schedules (#11250)

* Added conv2d relay test for each schedule
* Enable relay tests in qemu
* split aot test utils
---
 python/tvm/autotvm/__init__.py                |   1 +
 python/tvm/autotvm/task/__init__.py           |   1 +
 python/tvm/autotvm/task/dispatcher.py         |  53 +++
 python/tvm/micro/testing/aot_test_utils.py    | 105 ++++
 .../micro/{testing.py => testing/utils.py}    |   0
 .../tvm/testing/aot.py                        | 450 ++++++++----------
 tests/micro/zephyr/test_utils.py              |   2 +-
 tests/micro/zephyr/test_zephyr.py             |   2 +-
 tests/micro/zephyr/test_zephyr_aot.py         |   1 -
 .../contrib/test_cmsisnn/test_binary_ops.py   |   6 +-
 .../contrib/test_cmsisnn/test_conv2d.py       |  10 +-
 .../test_cmsisnn/test_fully_connected.py      |   9 +-
 .../test_cmsisnn/test_invalid_graphs.py       |   7 +-
 .../contrib/test_cmsisnn/test_networks.py     |  10 +-
 .../contrib/test_cmsisnn/test_pooling.py      |  11 +-
 .../contrib/test_cmsisnn/test_softmax.py      |   9 +-
 tests/python/contrib/test_ethosu/infra.py     |   2 +-
 .../contrib/test_ethosu/test_codegen.py       |   2 +-
 .../contrib/test_ethosu/test_networks.py      |   2 +-
 .../integration/test_arm_mprofile_dsp.py      |   8 +-
 tests/python/relay/aot/test_c_device_api.py   |   8 +-
 tests/python/relay/aot/test_cpp_aot.py        |   3 +-
 tests/python/relay/aot/test_crt_aot.py        |   6 +-
 tests/python/relay/aot/test_crt_aot_usmp.py   |   5 +-
 .../strategy/arm_cpu/test_conv2d_nchw.py      | 110 +++++
 .../strategy/arm_cpu/test_conv2d_nhwc.py      | 154 ++++++
 .../strategy/arm_cpu/test_depthwise_conv2d.py | 153 ++++++
 .../strategy/arm_cpu/test_group_conv2d.py     | 151 ++++++
 tests/python/relay/utils/external_codegen.py  |   3 +-
 tests/python/unittest/test_crt.py             |   2 +-
 tests/scripts/task_python_microtvm.sh         |   2 +
 31 files changed, 961 insertions(+), 327 deletions(-)
 create mode 100644 python/tvm/micro/testing/aot_test_utils.py
 rename python/tvm/micro/{testing.py => testing/utils.py} (100%)
 rename tests/python/relay/aot/aot_test_utils.py => python/tvm/testing/aot.py (72%)
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
 create mode 100644 tests/python/relay/strategy/arm_cpu/test_group_conv2d.py

diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index a3c59252b01a7..5a7d00960ecd8 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -60,5 +60,6 @@
     FallbackContext,
     ApplyHistoryBest as apply_history_best,
     ApplyGraphBest as apply_graph_best,
+    ApplyFixedConfig as apply_fixed_config,
 )
 from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 6eea62264d7de..3949d324c4df5 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -36,6 +36,7 @@
 from .dispatcher import (
     DispatchContext,
     ApplyConfig,
+    ApplyFixedConfig,
     ApplyHistoryBest,
     FallbackContext,
     clear_fallback_cache,
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 6c072dc1fa17b..11a608d4cbbf8 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -31,6 +31,8 @@
 from __future__ import absolute_import as _abs
 
 import logging
+import typing
+from typing import Union
 from collections.abc import Iterable
 
 import numpy as np
@@ -179,6 +181,57 @@ def update(self, target, workload, cfg):
         self._config = cfg
 
 
+class ApplyFixedConfig(DispatchContext):
+    """Apply a config of a deterministic schedule.
+    This is used for building a single Relay operator with deterministic schedule
+    for testing schedules at Relay level.
+
+    Parameters
+    ----------
+    tasks : list[tvm.autotvm.task.task.Task]
+        List of autoTVM tasks.
+    schedule_names : str, List[str]
+        Name(s) of the schedule(s) to prefer; a single str is treated as one name.
+    """
+
+    def __init__(self, tasks, schedule_names: Union[str, typing.List[str]]):
+        super(ApplyFixedConfig, self).__init__()
+        if isinstance(schedule_names, str):
+            self._schedule_names = [schedule_names]
+        elif isinstance(schedule_names, list):
+            self._schedule_names = schedule_names
+        else:
+            raise RuntimeError("Incorrect type: " + str(type(schedule_names)))
+        self._tasks = tasks
+        self.workload = None
+
+    def _query_inside(self, target, workload):
+        """Override query: return config 0 of the task named workload[0] with a fixed cost"""
+        self.workload = workload
+        # Create a config from the matching task; stays None when no task matches
+        config = None
+        for task in self._tasks:
+            if task.name == workload[0]:
+                config = task.config_space.get(0)
+                break
+
+        if not config:
+            raise RuntimeError(
+                "workload: %s does not exist in %s" % (str(workload), str(self._tasks))
+            )
+        # Add low cost to the target schedule and high cost to others.
+        if workload[0] in self._schedule_names:
+            config.cost = 1e-6
+        else:
+            config.cost = 100000
+        return config
+
+    def update(self, target, workload, cfg):
+        """Override update: remember the latest workload and config"""
+        self.workload = workload
+        self._config = cfg
+
+
 class ApplyHistoryBest(DispatchContext):
     """
     Apply the history best config
diff --git a/python/tvm/micro/testing/aot_test_utils.py b/python/tvm/micro/testing/aot_test_utils.py
new file mode 100644
index 0000000000000..82ac1ac68e9da
--- /dev/null
+++ b/python/tvm/micro/testing/aot_test_utils.py
@@ -0,0 +1,105 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import itertools
+import shutil
+
+import pytest
+
+pytest.importorskip("tvm.micro")
+
+import tvm
+from tvm.testing.aot import AOTTestRunner
+
+_LOG = logging.getLogger(__name__)
+
+
+AOT_DEFAULT_RUNNER = AOTTestRunner()
+
+# AOT Test Runner using the Arm® Corstone™-300 Reference Systems
+# see: https://developer.arm.com/ip-products/subsystem/corstone/corstone-300
+AOT_CORSTONE300_RUNNER = AOTTestRunner(
+    makefile="corstone300",
+    prologue="""
+    uart_init();
+    """,
+    includes=["uart.h"],
+    pass_config={
+        "relay.ext.cmsisnn.options": {
+            "mcpu": "cortex-m55",
+        }
+    },
+)
+
+AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner(
+    makefile="corstone300",
+    prologue="""
+    uart_init();
+    """,
+    includes=["uart.h"],
+    pass_config={
+        "relay.ext.cmsisnn.options": {
+            "mcpu": "cortex-m55",
+        },
+        "tir.usmp.enable": True,
+    },
+)
+
+
+def parametrize_aot_options(test):
+    """Parametrize a test over valid interface_api/use_unpacked_api/test_runner combinations"""
+
+    requires_arm_eabi = pytest.mark.skipif(
+        shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable"
+    )
+
+    interface_api = ["packed", "c"]
+    use_unpacked_api = [True, False]
+    test_runner = [AOT_DEFAULT_RUNNER, AOT_CORSTONE300_RUNNER]
+
+    all_combinations = itertools.product(interface_api, use_unpacked_api, test_runner)
+
+    # Drop the "c" interface when use_unpacked_api is False
+    valid_combinations = filter(
+        lambda parameters: not (parameters[0] == "c" and not parameters[1]),
+        all_combinations,
+    )
+
+    # Only use reference system for C interface and unpacked API calls
+    valid_combinations = filter(
+        lambda parameters: not (
+            parameters[2] == AOT_CORSTONE300_RUNNER
+            and (parameters[0] == "packed" or not parameters[1])
+        ),
+        valid_combinations,
+    )
+
+    # Mark Corstone-300 combinations to be skipped when arm-none-eabi-gcc is not found
+    marked_combinations = map(
+        lambda parameters: pytest.param(*parameters, marks=[requires_arm_eabi])
+        if parameters[2] == AOT_CORSTONE300_RUNNER
+        else parameters,
+        valid_combinations,
+    )
+
+    fn = pytest.mark.parametrize(
+        ["interface_api", "use_unpacked_api", "test_runner"],
+        marked_combinations,
+    )(test)
+
+    return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(fn)
diff --git a/python/tvm/micro/testing.py b/python/tvm/micro/testing/utils.py
similarity index 100%
rename from python/tvm/micro/testing.py
rename to python/tvm/micro/testing/utils.py
diff --git a/tests/python/relay/aot/aot_test_utils.py b/python/tvm/testing/aot.py
similarity index 72%
rename from tests/python/relay/aot/aot_test_utils.py
rename to python/tvm/testing/aot.py
index 2c4262a3d2be1..f8f170366ac55 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/python/tvm/testing/aot.py
@@ -14,39 +14,41 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
+"""Common functions for AOT test cases"""
 import sys
 import datetime
-import itertools
-import json
-import logging
 import os
 import pathlib
-import platform
 import re
 import shutil
 import subprocess
 import tarfile
 import tempfile
+import logging
 from typing import Any, NamedTuple, Union, Optional, List, Dict
-
-import pytest
 import numpy as np
 
-pytest.importorskip("tvm.micro")
-
 import tvm
 from tvm import relay
-from tvm import te
+from tvm import autotvm
 from tvm.contrib import utils, graph_executor
-from tvm.relay.backend import te_compiler, Executor, Runtime
-from tvm.relay.backend.te_compiler import TECompiler
+from tvm.relay.backend import Executor, Runtime
 from tvm.relay.backend.utils import mangle_module_name
 from tvm.micro import export_model_library_format
-from tvm.micro.testing import mlf_extract_workspace_size_bytes
+from tvm.micro.testing.utils import mlf_extract_workspace_size_bytes
 
 _LOG = logging.getLogger(__name__)
 
+NP_TYPE_TO_C = {
+    "int8": "int8_t",
+    "uint8": "uint8_t",
+    "int16": "int16_t",
+    "uint16": "uint16_t",
+    "int32": "int32_t",
+    "uint32": "uint32_t",
+    "float32": "float",
+}
+
 AOT_SUCCESS_TOKEN = "AOT_TEST_SUCCESS"
 AOT_FAILURE_TOKEN = "AOT_TEST_FAILURE"
 
@@ -138,119 +140,7 @@ class AOTTestRunner(NamedTuple):
     pass_config: Dict[str, Any] = {}
 
 
-AOT_DEFAULT_RUNNER = AOTTestRunner()
-
-# AOT Test Runner using the Arm® Corstone™-300 Reference Systems
-# see: https://developer.arm.com/ip-products/subsystem/corstone/corstone-300
-AOT_CORSTONE300_RUNNER = AOTTestRunner(
-    makefile="corstone300",
-    prologue="""
-    uart_init();
-    """,
-    includes=["uart.h"],
-    pass_config={
-        "relay.ext.cmsisnn.options": {
-            "mcpu": "cortex-m55",
-        }
-    },
-)
-
-AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner(
-    makefile="corstone300",
-    prologue="""
-    uart_init();
-    """,
-    includes=["uart.h"],
-    pass_config={
-        "relay.ext.cmsisnn.options": {
-            "mcpu": "cortex-m55",
-        },
-        "tir.usmp.enable": True,
-    },
-)
-
-NP_TYPE_TO_C = {
-    "int8": "int8_t",
-    "uint8": "uint8_t",
-    "int16": "int16_t",
-    "uint16": "uint16_t",
-    "int32": "int32_t",
-    "uint32": "uint32_t",
-    "float32": "float",
-}
-
-
-def mangle_name(mod_name, name):
-    mod_name = mangle_module_name(mod_name)
-    return mod_name + "_" + name
-
-
-def convert_to_relay(
-    tflite_model_buf,
-):
-    """Convert a tflite model buffer in a Relay module"""
-    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
-    try:
-        import tflite.Model
-
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-    except AttributeError:
-        import tflite
-
-        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
-    except ImportError:
-        raise ImportError("The tflite package must be installed")
-
-    mod, params = relay.frontend.from_tflite(tflite_model)
-    mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params)
-    return mod, params
-
-
-def parametrize_aot_options(test):
-    """Parametrize over valid option combinations"""
-
-    requires_arm_eabi = pytest.mark.skipif(
-        shutil.which("arm-none-eabi-gcc") is None, reason="ARM embedded toolchain unavailable"
-    )
-
-    interface_api = ["packed", "c"]
-    use_unpacked_api = [True, False]
-    test_runner = [AOT_DEFAULT_RUNNER, AOT_CORSTONE300_RUNNER]
-
-    all_combinations = itertools.product(interface_api, use_unpacked_api, test_runner)
-
-    # Filter out packed operators with c interface
-    valid_combinations = filter(
-        lambda parameters: not (parameters[0] == "c" and not parameters[1]),
-        all_combinations,
-    )
-
-    # Only use reference system for C interface and unpacked API calls
-    valid_combinations = filter(
-        lambda parameters: not (
-            parameters[2] == AOT_CORSTONE300_RUNNER
-            and (parameters[0] == "packed" or not parameters[1])
-        ),
-        valid_combinations,
-    )
-
-    # Skip reference system tests if running in i386 container
-    marked_combinations = map(
-        lambda parameters: pytest.param(*parameters, marks=[requires_arm_eabi])
-        if parameters[2] == AOT_CORSTONE300_RUNNER
-        else parameters,
-        valid_combinations,
-    )
-
-    fn = pytest.mark.parametrize(
-        ["interface_api", "use_unpacked_api", "test_runner"],
-        marked_combinations,
-    )(test)
-
-    return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(fn)
-
-
-def subprocess_check_log_output(cmd, cwd, logfile):
+def _subprocess_check_log_output(cmd, cwd, logfile):
     """
     This method runs a process and logs the output to both a log file and stdout
     """
@@ -290,15 +180,21 @@ def subprocess_check_log_output(cmd, cwd, logfile):
         raise RuntimeError(f"Subprocess failed: {cmd}\nstdout:\n{stdout}")
 
 
+def _mangle_name(mod_name, name):
+    mod_name = mangle_module_name(mod_name)
+    return mod_name + "_" + name
+
+
 # TODO: Move to linker script with list of symbols rather than coding into source
-def emit_data_linkage(output_file, data_linkage):
+def _emit_data_linkage(output_file, data_linkage):
     if data_linkage is not None:
         output_file.write(
-            f'__attribute__((section("{data_linkage.section}"), aligned({data_linkage.alignment}))) '
+            f'__attribute__((section("{data_linkage.section}"), '
+            f"aligned({data_linkage.alignment}))) "
         )
 
 
-def emit_main_prologue(
+def _emit_main_prologue(
     main_file,
     custom_prologue,
     workspace_bytes,
@@ -316,16 +212,14 @@ def emit_main_prologue(
         # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
         workspace_define += " + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n"
         main_file.write(workspace_define)
-        emit_data_linkage(main_file, data_linkage)
+        _emit_data_linkage(main_file, data_linkage)
         main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")
         main_file.write("tvm_workspace_t app_workspace;\n")
         main_file.write(
-            """
-            
+            """\n
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
     return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
 }
-
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
     return StackMemoryManager_Free(&app_workspace,ptr);
 }
@@ -334,30 +228,24 @@ def emit_main_prologue(
     else:
         # An implementation is not needed for these if the stack allocator is not used
         main_file.write(
-            """
-            
+            """\n
 tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
     return kTvmErrorFunctionCallNotImplemented;
 }
-
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
     return kTvmErrorFunctionCallNotImplemented;
 }
-
             """
         )
     main_file.write(
-        """
-    
+        """\n
 void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); }
-
 void TVMLogf(const char* msg, ...) {
   va_list args;
   va_start(args, msg);
   vfprintf(stdout, msg, args);
   va_end(args);
-}
-    
+}\n
 TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {}
 int main(){\n
     """
@@ -365,105 +253,105 @@ def emit_main_prologue(
     main_file.write(custom_prologue)
 
 
-def emit_main_data(main_file, input_map, output_map, mod_name):
+def _emit_main_data(main_file, input_map, output_map, mod_name):
     for key in input_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
         main_file.write(
-            f'#include "{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}.h"\n'
+            f'#include "{_mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}.h"\n'
         )
 
     for key in output_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
         main_file.write(
-            f'#include "{mangle_name(mod_name,"expected_output_data")}_{sanitized_tensor_name}.h"\n'
-            f'#include "{mangle_name(mod_name,"output_data")}_{sanitized_tensor_name}.h"\n'
+            f'#include "{_mangle_name(mod_name,"expected_output_data")}_'
+            f'{sanitized_tensor_name}.h"\n'
+            f'#include "{_mangle_name(mod_name,"output_data")}_'
+            f'{sanitized_tensor_name}.h"\n'
         )
 
 
-def emit_main_device_structs(main_file, devices, mod_name):
+def _emit_main_device_structs(main_file, devices, mod_name):
     if devices:
         main_file.write(
-            f"struct {mangle_name(mod_name, 'devices')} {mangle_name(mod_name, 'devices')} = {{"
+            f"struct {_mangle_name(mod_name, 'devices')} {_mangle_name(mod_name, 'devices')} = {{"
         )
         for device in devices:
             main_file.write(f"\t.{device} = {device},\n")
         main_file.write("};\n")
 
 
-def emit_main_workspace_pool_structs(main_file, workspace_pool_names, mod_name):
+def _emit_main_workspace_pool_structs(main_file, workspace_pool_names, mod_name):
     if workspace_pool_names and len(workspace_pool_names) > 0:
         main_file.write(
-            f"struct {mangle_name(mod_name, 'workspace_pools')} {mangle_name(mod_name, 'workspace_pools')} = {{"
+            f"struct {_mangle_name(mod_name, 'workspace_pools')} "
+            f"{_mangle_name(mod_name, 'workspace_pools')} = {{"
         )
         for workspace_pool_name in workspace_pool_names:
             main_file.write(f"\t.{workspace_pool_name} = {workspace_pool_name},\n")
         main_file.write("};\n")
 
 
-def emit_main_data_structs(main_file, input_map, output_map, mod_name):
+def _emit_main_data_structs(main_file, input_map, output_map, mod_name):
     main_file.write(
-        f"struct {mangle_name(mod_name, 'inputs')} {mangle_name(mod_name, 'inputs')} = {{"
+        f"struct {_mangle_name(mod_name, 'inputs')} {_mangle_name(mod_name, 'inputs')} = {{"
     )
     for key in input_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
         main_file.write(
-            f"\t.{sanitized_tensor_name} = {mangle_name(mod_name, 'input_data')}_{sanitized_tensor_name},\n"
+            f"\t.{sanitized_tensor_name} = "
+            f"{_mangle_name(mod_name, 'input_data')}_{sanitized_tensor_name},\n"
         )
     main_file.write("};\n")
 
     main_file.write(
-        f"struct {mangle_name(mod_name, 'outputs')} {mangle_name(mod_name, 'outputs')} = {{"
+        f"struct {_mangle_name(mod_name, 'outputs')} {_mangle_name(mod_name, 'outputs')} = {{"
     )
     for key in output_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
         main_file.write(
-            f"\t.{sanitized_tensor_name} = {mangle_name(mod_name, 'output_data')}_{sanitized_tensor_name},\n"
+            f"\t.{sanitized_tensor_name} = {_mangle_name(mod_name, 'output_data')}_"
+            f"{sanitized_tensor_name},\n"
         )
     main_file.write("};\n")
 
 
-def emit_main_data_setup(main_file, input_map, output_map, mod_name):
+def _emit_main_data_setup(main_file, input_map, output_map, mod_name):
     num_outputs = len(output_map)
     num_inputs = len(input_map)
-
-    main_file.write(f'void* {mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ')
+    main_file.write(f'void* {_mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ')
     for key in input_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
-        main_file.write(f'{mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}, ')
+        main_file.write(f'{_mangle_name(mod_name,"input_data")}_{sanitized_tensor_name}, ')
     main_file.write("};\n")
-
-    main_file.write(f'void* {mangle_name(mod_name,"outputs")}[{num_outputs}]  = {{ ')
+    main_file.write(f'void* {_mangle_name(mod_name,"outputs")}[{num_outputs}]  = {{ ')
     for key in output_map:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
-        main_file.write(f'{mangle_name(mod_name, "output_data")}_{sanitized_tensor_name}, ')
+        main_file.write(f'{_mangle_name(mod_name, "output_data")}_{sanitized_tensor_name}, ')
     main_file.write("};\n")
 
 
-def emit_main_c_interface_call(
+def _emit_main_c_interface_call(
     main_file, devices, workspace_pool_names, mod_name, use_workspace_io
 ):
     sub_strings = list()
-    sub_strings.append(f'{mangle_name(mod_name,"run")}(')
+    sub_strings.append(f'{_mangle_name(mod_name,"run")}(')
     if not use_workspace_io:
-        sub_strings.append(f'&{mangle_name(mod_name,"inputs")}, ')
-        sub_strings.append(f'&{mangle_name(mod_name,"outputs")}, ')
+        sub_strings.append(f'&{_mangle_name(mod_name,"inputs")}, ')
+        sub_strings.append(f'&{_mangle_name(mod_name,"outputs")}, ')
     if workspace_pool_names:
-        sub_strings.append(f'&{mangle_name(mod_name,"workspace_pools")}, ')
+        sub_strings.append(f'&{_mangle_name(mod_name,"workspace_pools")}, ')
     if devices:
-        sub_strings.append(f'&{mangle_name(mod_name,"devices")}, ')
+        sub_strings.append(f'&{_mangle_name(mod_name,"devices")}, ')
     # Removing the last two characters that is a comma and a space
     sub_strings[-1] = sub_strings[-1][:-2]
     # Adding brackets and newline instead
     sub_strings[-1] = sub_strings[-1] + ");\n"
 
-    main_file_string = ""
-    for sub_string in sub_strings:
-        main_file_string += sub_string
-
+    main_file_string = "".join(sub_strings)
     main_file.write(main_file_string)
 
 
-def emit_main_fake_packed_values(main_file):
+def _emit_main_fake_packed_values(main_file):
     main_file.write(
         """
     static DLDevice fake_device = {kDLCPU, 0};
@@ -473,10 +361,10 @@ def emit_main_fake_packed_values(main_file):
     )
 
 
-def emit_main_packed_call(main_file, input_map, output_list, mod_name):
-    tensors_name = mangle_name(mod_name, "tensors")
-    values_name = mangle_name(mod_name, "values")
-    typeids_name = mangle_name(mod_name, "typeids")
+def _emit_main_packed_call(main_file, input_map, output_list, mod_name):
+    tensors_name = _mangle_name(mod_name, "tensors")
+    values_name = _mangle_name(mod_name, "values")
+    typeids_name = _mangle_name(mod_name, "typeids")
 
     def fake_tensor(source, source_index, packed_index):
         main_file.write(
@@ -503,20 +391,20 @@ def fake_tensor(source, source_index, packed_index):
     )
 
     for i in range(0, num_inputs):
-        fake_tensor(mangle_name(mod_name, "inputs"), i, i)
+        fake_tensor(_mangle_name(mod_name, "inputs"), i, i)
     for i in range(0, num_outputs):
-        fake_tensor(mangle_name(mod_name, "outputs"), i, i + num_inputs)
+        fake_tensor(_mangle_name(mod_name, "outputs"), i, i + num_inputs)
 
     main_file.write(
-        f'{mangle_name(mod_name, "run")}({values_name}, {typeids_name}, 0, NULL, 0, NULL);\n'
+        f'{_mangle_name(mod_name, "run")}({values_name}, {typeids_name}, 0, NULL, 0, NULL);\n'
     )
     main_file.write("\n")
 
 
-def emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interface_c=False):
+def _emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interface_c=False):
     for key in outputs:
         sanitized_tensor_name = re.sub(r"\W", "_", key)
-        expected_data_name = mangle_name(mod_name, f"expected_output_data_{sanitized_tensor_name}")
+        expected_data_name = _mangle_name(mod_name, f"expected_output_data_{sanitized_tensor_name}")
         is_float_dtype = outputs[key].dtype == "float32"
 
         comparison_function = "abs"
@@ -526,40 +414,39 @@ def emit_main_compare(main_file, outputs, output_tolerance, mod_name, use_interf
             tolerance = output_tolerance or 0.001
 
         data_length_var_name = (
-            mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") + "_len"
+            _mangle_name(mod_name, f"output_data_{sanitized_tensor_name}") + "_len"
         )
         if use_interface_c:
             c_type = NP_TYPE_TO_C[str(outputs[key].dtype)]
-            actual_data_name = f"(({c_type}*)" + mangle_name(
+            actual_data_name = f"(({c_type}*)" + _mangle_name(
                 mod_name, f"outputs.{sanitized_tensor_name})"
             )
         else:
-            actual_data_name = mangle_name(mod_name, f"output_data_{sanitized_tensor_name}")
+            actual_data_name = _mangle_name(mod_name, f"output_data_{sanitized_tensor_name}")
         main_file.write(
-            f"""
-            for (int i = 0; i<{data_length_var_name}; i++) {{
-                if ({comparison_function}({actual_data_name}[i]-{expected_data_name}[i]) > {tolerance}) {{
-                    printf("{AOT_FAILURE_TOKEN}\\n");
-                    return -1;
-                }}
-            }}
-            """
+            f"for (int i = 0; i<{data_length_var_name}; i++) {{\n"
+            f"\tif ({comparison_function}({actual_data_name}[i]-"
+            f"{expected_data_name}[i]) > {tolerance}) {{\n"
+            f'\t\tprintf("{AOT_FAILURE_TOKEN}\\n");\n'
+            f"\t\treturn -1;\n"
+            f"\t}}\n"
+            f"}}"
         )
 
 
-def emit_main_init_memory_manager(main_file):
+def _emit_main_init_memory_manager(main_file):
     main_file.write("StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);")
     main_file.write("\n")
 
 
-def emit_main_epilogue(main_file, custom_epilogue):
+def _emit_main_epilogue(main_file, custom_epilogue):
     main_file.write(custom_epilogue)
     main_file.write(f'printf("{AOT_SUCCESS_TOKEN}\\n");')
     main_file.write("return 0;")
     main_file.write("}\n")
 
 
-def emit_main_common_includes(main_file, custom_includes):
+def _emit_main_common_includes(main_file, custom_includes):
     main_file.write("#include <stdio.h>\n")
     main_file.write("#include <stdarg.h>\n")
     main_file.write("#include <stdlib.h>\n")
@@ -570,11 +457,11 @@ def emit_main_common_includes(main_file, custom_includes):
         main_file.write(f'#include "{include}"\n')
 
 
-def emit_main_micro_include(main_file, mod_name):
+def _emit_main_micro_include(main_file, mod_name):
     main_file.write(f"#include <{mangle_module_name(mod_name)}.h>\n")
 
 
-def create_main(
+def _create_main(
     test_name,
     compiled_models,
     output_path,
@@ -591,17 +478,17 @@ def create_main(
     # create header file
     raw_path = file_path.with_suffix(".c").resolve()
     with open(raw_path, "w") as main_file:
-        emit_main_common_includes(main_file, custom_includes)
+        _emit_main_common_includes(main_file, custom_includes)
 
         if interface_api == "c":
             for compiled_model in compiled_models:
                 model = compiled_model.model
-                emit_main_micro_include(main_file, model.name)
+                _emit_main_micro_include(main_file, model.name)
         for compiled_model in compiled_models:
             model = compiled_model.model
-            emit_main_data(main_file, model.inputs, model.outputs, model.name)
+            _emit_main_data(main_file, model.inputs, model.outputs, model.name)
 
-        emit_main_prologue(
+        _emit_main_prologue(
             main_file,
             custom_prologue,
             workspace_bytes,
@@ -611,7 +498,7 @@ def create_main(
             use_stack_allocator,
         )
         if use_stack_allocator:
-            emit_main_init_memory_manager(main_file)
+            _emit_main_init_memory_manager(main_file)
 
         if interface_api == "c":
             for compiled_model in compiled_models:
@@ -627,32 +514,33 @@ def create_main(
                         for allocated_pool in dict(executor_codegen_metadata.pool_inputs).values()
                         if not allocated_pool.pool_info.is_internal
                     ]
-                emit_main_device_structs(main_file, devices, model.name)
+                _emit_main_device_structs(main_file, devices, model.name)
                 if not use_workspace_io:
-                    emit_main_workspace_pool_structs(main_file, workspace_pool_names, model.name)
-                    emit_main_data_structs(main_file, model.inputs, model.outputs, model.name)
-                emit_main_c_interface_call(
+                    _emit_main_workspace_pool_structs(main_file, workspace_pool_names, model.name)
+                    _emit_main_data_structs(main_file, model.inputs, model.outputs, model.name)
+                _emit_main_c_interface_call(
                     main_file, devices, workspace_pool_names, model.name, use_workspace_io
                 )
         else:
-            emit_main_fake_packed_values(main_file)
+            _emit_main_fake_packed_values(main_file)
             for compiled_model in compiled_models:
                 model = compiled_model.model
-                emit_main_data_setup(main_file, model.inputs, model.outputs, model.name)
-                emit_main_packed_call(main_file, model.inputs, model.outputs, model.name)
+                _emit_main_data_setup(main_file, model.inputs, model.outputs, model.name)
+                _emit_main_packed_call(main_file, model.inputs, model.outputs, model.name)
 
         for compiled_model in compiled_models:
             model = compiled_model.model
-            emit_main_compare(
+            _emit_main_compare(
                 main_file, model.outputs, model.output_tolerance, model.name, interface_api == "c"
             )
-        emit_main_epilogue(main_file, custom_epilogue)
+        _emit_main_epilogue(main_file, custom_epilogue)
 
 
-def create_header_file(tensor_name, npy_data, output_path, data_linkage):
+def _create_header_file(tensor_name, npy_data, output_path, data_linkage):
     """
     This method generates a header file containing the data contained in the numpy array provided.
-    It is used to capture the tensor data (for both inputs and expected outputs) to be bundled into the standalone application.
+    It is used to capture the tensor data (for both inputs and expected outputs)
+    to be bundled into the standalone application.
     """
     file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve()
     # create header file
@@ -663,7 +551,7 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage):
         header_file.write("#include <dlpack/dlpack.h>\n")
         header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n")
 
-        emit_data_linkage(header_file, data_linkage)
+        _emit_data_linkage(header_file, data_linkage)
 
         header_file.write(f"{NP_TYPE_TO_C[str(npy_data.dtype)]} {tensor_name}[] =")
 
@@ -673,6 +561,27 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage):
         header_file.write("};\n\n")
 
 
+def convert_to_relay(
+    tflite_model_buf,
+):
+    """Convert a tflite model buffer into a Relay module with its params bound to main"""
+    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
+    try:
+        import tflite.Model  # pylint: disable=import-outside-toplevel
+
+        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+    except AttributeError:
+        import tflite  # pylint: disable=import-outside-toplevel
+
+        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+    except ImportError:
+        raise ImportError("The tflite package must be installed")
+
+    mod, params = relay.frontend.from_tflite(tflite_model)
+    mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params)
+    return mod, params
+
+
 def compile_models(
     models: Union[List[AOTTestModel], AOTTestModel],
     interface_api: str,
@@ -683,6 +592,7 @@ def compile_models(
     use_runtime_executor: bool = True,
     target: tvm.target.Target = tvm.target.Target("c"),
     workspace_memory_pools=None,
+    schedule_name: str = None,
 ) -> List[AOTCompiledTestModel]:
     """
     This method generates runtime.Modules for the tests
@@ -708,31 +618,62 @@ def compile_models(
 
     compiled_mods = list()
     for model in models:
-        with tvm.transform.PassContext(opt_level=3, config=config):
-            # TODO(Mousius) - Remove once executor/runtime are fully removed from Target
-            if use_runtime_executor:
-                executor_factory = tvm.relay.build(
-                    model.module,
-                    target,
-                    executor=executor,
-                    runtime=runtime,
-                    workspace_memory_pools=workspace_memory_pools,
-                    params=model.params,
-                    mod_name=model.name,
-                )
-                compiled_mods.append(
-                    AOTCompiledTestModel(model=model, executor_factory=executor_factory)
-                )
-            else:
-                executor_factory = tvm.relay.build(
-                    model.module,
-                    tvm.target.Target(target, host=target),
-                    params=model.params,
-                    mod_name=model.name,
-                )
-                compiled_mods.append(
-                    AOTCompiledTestModel(model=model, executor_factory=executor_factory)
-                )
+        if schedule_name:
+            # Testing with deterministic schedule
+            task_list = autotvm.task.extract_from_program(
+                model.module, target=target, params=model.params
+            )
+            with tvm.autotvm.apply_fixed_config(task_list, schedule_name):
+                with tvm.transform.PassContext(opt_level=3, config=config):
+                    if use_runtime_executor:
+                        executor_factory = tvm.relay.build(
+                            model.module,
+                            target,
+                            executor=executor,
+                            runtime=runtime,
+                            workspace_memory_pools=workspace_memory_pools,
+                            params=model.params,
+                            mod_name=model.name,
+                        )
+                        compiled_mods.append(
+                            AOTCompiledTestModel(model=model, executor_factory=executor_factory)
+                        )
+                    else:
+                        executor_factory = tvm.relay.build(
+                            model.module,
+                            tvm.target.Target(target, host=target),
+                            params=model.params,
+                            mod_name=model.name,
+                        )
+                        compiled_mods.append(
+                            AOTCompiledTestModel(model=model, executor_factory=executor_factory)
+                        )
+        else:
+            with tvm.transform.PassContext(opt_level=3, config=config):
+                # TODO(Mousius) - Remove once executor/runtime are fully removed from Target
+                if use_runtime_executor:
+                    executor_factory = tvm.relay.build(
+                        model.module,
+                        target,
+                        executor=executor,
+                        runtime=runtime,
+                        workspace_memory_pools=workspace_memory_pools,
+                        params=model.params,
+                        mod_name=model.name,
+                    )
+                    compiled_mods.append(
+                        AOTCompiledTestModel(model=model, executor_factory=executor_factory)
+                    )
+                else:
+                    executor_factory = tvm.relay.build(
+                        model.module,
+                        tvm.target.Target(target, host=target),
+                        params=model.params,
+                        mod_name=model.name,
+                    )
+                    compiled_mods.append(
+                        AOTCompiledTestModel(model=model, executor_factory=executor_factory)
+                    )
     return compiled_mods
 
 
@@ -788,8 +729,8 @@ def run_and_check_body(base_path):
             workspace_bytes += model.extra_memory_in_bytes
             for key in model.inputs:
                 sanitized_tensor_name = re.sub(r"\W", "_", key)
-                create_header_file(
-                    f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
+                _create_header_file(
+                    f'{_mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                     model.inputs[key],
                     include_path,
                     data_linkage,
@@ -797,14 +738,14 @@ def run_and_check_body(base_path):
 
             for key in model.outputs:
                 sanitized_tensor_name = re.sub(r"\W", "_", key)
-                create_header_file(
-                    f'{mangle_name(model.name, "output_data")}_{sanitized_tensor_name}',
+                _create_header_file(
+                    f'{_mangle_name(model.name, "output_data")}_{sanitized_tensor_name}',
                     np.zeros(model.outputs[key].shape, model.outputs[key].dtype),
                     include_path,
                     data_linkage,
                 )
-                create_header_file(
-                    f'{mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}',
+                _create_header_file(
+                    f'{_mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}',
                     model.outputs[key],
                     include_path,
                     data_linkage,
@@ -814,7 +755,7 @@ def run_and_check_body(base_path):
         # We only need the stack allocator if USMP is not used
         use_stack_allocator = not use_usmp
 
-        create_main(
+        _create_main(
             "test.c",
             models,
             build_path,
@@ -830,8 +771,9 @@ def run_and_check_body(base_path):
 
         # Verify that compiles fine
         file_dir = os.path.dirname(os.path.abspath(__file__))
+        makefile_dir = os.path.join(file_dir, "../../../tests/python/relay/aot")
         codegen_path = os.path.join(base_path, "codegen")
-        makefile = os.path.join(file_dir, f"{runner.makefile}.mk")
+        makefile = os.path.join(makefile_dir, f"{runner.makefile}.mk")
         fvp_dir = "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/"
         # TODO(@grant-arm): Remove once ci_cpu docker image has been updated to FVP_Corstone_SSE
         if not os.path.isdir(fvp_dir):
@@ -842,8 +784,8 @@ def run_and_check_body(base_path):
         make_command = (
             f"make -f {makefile} build_dir={build_path}"
             + f" CFLAGS='{cflags}'"
-            + f" TVM_ROOT={file_dir}/../../../.."
-            + f" AOT_TEST_ROOT={file_dir}"
+            + f" TVM_ROOT={file_dir}/../../.."
+            + f" AOT_TEST_ROOT={makefile_dir}"
             + f" CODEGEN_ROOT={codegen_path}"
             + f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}"
             + f" FVP_DIR={fvp_dir}"
@@ -854,7 +796,7 @@ def run_and_check_body(base_path):
         compile_command = f"{make_command} aot_test_runner"
         if verbose:
             print("Compile command:\n", compile_command)
-        subprocess_check_log_output(compile_command, ".", compile_log_path)
+        _subprocess_check_log_output(compile_command, ".", compile_log_path)
 
         # Verify that runs fine
         run_log_path = os.path.join(build_path, "test_run.log")
@@ -865,11 +807,11 @@ def run_and_check_body(base_path):
         # TODO(lhutton1) This is a quick and dirty work around to help temporarily reduce
         # the flakyness of the tests. Will remove once #10300 and #10314 are resolved.
         try:
-            subprocess_check_log_output(run_command, build_path, run_log_path)
+            _subprocess_check_log_output(run_command, build_path, run_log_path)
         except RuntimeError as err:
             print("Failed to run the module, having a second attempt...", file=sys.stderr)
             print(err, file=sys.stderr)
-            subprocess_check_log_output(run_command, build_path, run_log_path)
+            _subprocess_check_log_output(run_command, build_path, run_log_path)
 
         with open(run_log_path) as run_log:
             assert AOT_SUCCESS_TOKEN in run_log.read()
@@ -895,6 +837,7 @@ def compile_and_run(
     target_opts: Dict = None,
     test_dir: str = None,
     verbose: bool = False,
+    schedule_name: str = None,
 ):
     """This is a wrapper API to compile and run models as test for AoT
 
@@ -919,6 +862,7 @@ def compile_and_run(
         pass_config=runner.pass_config,
         use_runtime_executor=use_runtime_executor,
         target=tvm.target.Target(target),
+        schedule_name=schedule_name,
     )
 
     run_and_check(
diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py
index e0aad7c3c6d57..4fd3e39fd1c02 100644
--- a/tests/micro/zephyr/test_utils.py
+++ b/tests/micro/zephyr/test_utils.py
@@ -32,7 +32,7 @@
 import tvm.micro
 from tvm.micro import export_model_library_format
 from tvm.micro.model_library_format import generate_c_interface_header
-from tvm.micro.testing import (
+from tvm.micro.testing.utils import (
     mlf_extract_workspace_size_bytes,
     aot_transport_init_wait,
     aot_transport_find_message,
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py
index 1582d7e4a5fec..49e5e2757b200 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -30,7 +30,7 @@
 from tvm.relay.backend import Executor, Runtime
 from tvm.relay.testing import byoc
 from tvm.contrib import utils
-from tvm.micro.testing import check_tune_log
+from tvm.micro.testing.utils import check_tune_log
 
 import test_utils
 
diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py
index 87c7dc92fbda5..6b355f28de4b1 100644
--- a/tests/micro/zephyr/test_zephyr_aot.py
+++ b/tests/micro/zephyr/test_zephyr_aot.py
@@ -33,7 +33,6 @@
 from tvm.relay.backend import Executor, Runtime
 
 from tvm.contrib.download import download_testdata
-from tvm.micro.testing import aot_transport_init_wait, aot_transport_find_message
 
 import test_utils
 
diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py
index 028ab406243ff..7846bba1e089c 100644
--- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py
+++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py
@@ -36,12 +36,10 @@
     assert_partitioned_function,
     assert_no_external_function,
 )
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
+from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run
+from tvm.micro.testing.aot_test_utils import (
     AOT_CORSTONE300_RUNNER,
     AOT_USMP_CORSTONE300_RUNNER,
-    generate_ref_data,
-    compile_and_run,
 )
 
 
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index 47245f60e15ef..1cdf985101484 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -23,15 +23,9 @@
 from tvm import relay
 from tvm.relay.op.contrib import cmsisnn
 
+from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run
 
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
-    AOT_CORSTONE300_RUNNER,
-    AOT_USMP_CORSTONE300_RUNNER,
-    AOT_DEFAULT_RUNNER,
-    generate_ref_data,
-    compile_and_run,
-)
+from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER
 from utils import (
     skip_if_no_reference_system,
     make_module,
diff --git a/tests/python/contrib/test_cmsisnn/test_fully_connected.py b/tests/python/contrib/test_cmsisnn/test_fully_connected.py
index ec2e9bbdcca77..111d3b2edac15 100644
--- a/tests/python/contrib/test_cmsisnn/test_fully_connected.py
+++ b/tests/python/contrib/test_cmsisnn/test_fully_connected.py
@@ -23,14 +23,9 @@
 from tvm import relay
 from tvm.relay.op.contrib import cmsisnn
 
-
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
-    AOT_CORSTONE300_RUNNER,
+from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run
+from tvm.micro.testing.aot_test_utils import (
     AOT_USMP_CORSTONE300_RUNNER,
-    AOT_DEFAULT_RUNNER,
-    generate_ref_data,
-    compile_and_run,
 )
 from utils import (
     skip_if_no_reference_system,
diff --git a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py
index 7808fbf7752f2..d0a8547d32acd 100644
--- a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py
+++ b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py
@@ -22,12 +22,9 @@
 import tvm
 from tvm import relay
 
-
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import (
     AOT_USMP_CORSTONE300_RUNNER,
-    generate_ref_data,
-    compile_and_run,
 )
 from utils import (
     skip_if_no_reference_system,
diff --git a/tests/python/contrib/test_cmsisnn/test_networks.py b/tests/python/contrib/test_cmsisnn/test_networks.py
index a6e77515859e9..fefce9e86c2dc 100644
--- a/tests/python/contrib/test_cmsisnn/test_networks.py
+++ b/tests/python/contrib/test_cmsisnn/test_networks.py
@@ -28,16 +28,14 @@
 from tvm.relay.op.contrib import cmsisnn
 
 from utils import skip_if_no_reference_system, get_range_for_dtype_str
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import (
     AOT_CORSTONE300_RUNNER,
     AOT_USMP_CORSTONE300_RUNNER,
-    generate_ref_data,
-    compile_and_run,
 )
 
 
-def convert_to_relay(
+def _convert_to_relay(
     tflite_model_buf,
     input_data,
     input_node,
@@ -95,7 +93,7 @@ def test_cnn_small(test_runner):
     rng = np.random.default_rng(12345)
     input_data = rng.integers(in_min, high=in_max, size=input_shape, dtype=dtype)
 
-    orig_mod, params = convert_to_relay(tflite_model_buf, input_data, "input")
+    orig_mod, params = _convert_to_relay(tflite_model_buf, input_data, "input")
     cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
 
     # validate CMSIS-NN output against CPU output
diff --git a/tests/python/contrib/test_cmsisnn/test_pooling.py b/tests/python/contrib/test_cmsisnn/test_pooling.py
index cca1288ac2a0e..a2650bb8d0289 100644
--- a/tests/python/contrib/test_cmsisnn/test_pooling.py
+++ b/tests/python/contrib/test_cmsisnn/test_pooling.py
@@ -23,15 +23,8 @@
 from tvm import relay
 from tvm.relay.op.contrib import cmsisnn
 
-
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
-    AOT_CORSTONE300_RUNNER,
-    AOT_USMP_CORSTONE300_RUNNER,
-    AOT_DEFAULT_RUNNER,
-    generate_ref_data,
-    compile_and_run,
-)
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER
 from utils import (
     skip_if_no_reference_system,
     make_module,
diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py
index 6eac76d841b45..5a44a7865e66d 100644
--- a/tests/python/contrib/test_cmsisnn/test_softmax.py
+++ b/tests/python/contrib/test_cmsisnn/test_softmax.py
@@ -34,13 +34,8 @@
     assert_partitioned_function,
     assert_no_external_function,
 )
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
-    AOT_CORSTONE300_RUNNER,
-    AOT_USMP_CORSTONE300_RUNNER,
-    generate_ref_data,
-    compile_and_run,
-)
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER
 
 
 def make_model(
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 0c42b024f274b..20bd12945f8f4 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -47,7 +47,7 @@
 import tvm.relay.testing.tf as tf_testing
 
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
-from tests.python.relay.aot.aot_test_utils import (
+from tvm.testing.aot import (
     AOTCompiledTestModel,
     AOTDataLinkage,
     AOTTestModel,
diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 4268392f1b788..7ea813762796b 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -29,7 +29,7 @@
 from tvm.relay.backend.contrib.ethosu import util
 
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
-from tests.python.relay.aot.aot_test_utils import generate_ref_data
+from tvm.testing.aot import generate_ref_data
 
 from . import infra
 
diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py
index f64263ca0623b..b91168b7bbe6b 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -24,7 +24,7 @@
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 from tvm.micro import model_library_format as mlf
 
-from tests.python.relay.aot.aot_test_utils import convert_to_relay
+from tvm.testing.aot import convert_to_relay
 
 from . import infra
 
diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py
index 484c19fa222ca..7628755af4ac9 100644
--- a/tests/python/integration/test_arm_mprofile_dsp.py
+++ b/tests/python/integration/test_arm_mprofile_dsp.py
@@ -20,12 +20,8 @@
 import tvm
 import tvm.testing
 from tvm import relay
-from tests.python.relay.aot.aot_test_utils import (
-    AOTTestModel,
-    AOT_CORSTONE300_RUNNER,
-    generate_ref_data,
-    compile_and_run,
-)
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
 
 
 @tvm.testing.requires_corstone300
diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py
index f9fa0c6eadbb9..3c7db62890f58 100644
--- a/tests/python/relay/aot/test_c_device_api.py
+++ b/tests/python/relay/aot/test_c_device_api.py
@@ -24,12 +24,8 @@
 
 from tvm import relay
 from tvm.ir.module import IRModule
-from aot_test_utils import (
-    AOT_DEFAULT_RUNNER,
-    AOTTestModel,
-    generate_ref_data,
-    compile_models,
-)
+from tvm.testing.aot import AOTTestModel, generate_ref_data, compile_models
+from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER
 
 
 @pytest.fixture
diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py
index cdcc61c33ac74..4a12678a79d93 100644
--- a/tests/python/relay/aot/test_cpp_aot.py
+++ b/tests/python/relay/aot/test_cpp_aot.py
@@ -27,7 +27,8 @@
 from tvm import IRModule
 from tvm import relay
 from tvm.relay import backend, testing
-from aot_test_utils import AOT_DEFAULT_RUNNER, AOTTestModel, generate_ref_data, compile_and_run
+from tvm.testing.aot import generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER
 
 
 def test_error_c_interface():
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 2991cc01fc929..d1d80d434b6a6 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -37,16 +37,14 @@
 from tvm.micro import model_library_format as mlf
 from tvm.micro import export_model_library_format
 from tvm.ir.instrument import pass_instrument
-from aot_test_utils import (
+from tvm.testing.aot import (
     AOTTestModel,
-    AOT_DEFAULT_RUNNER,
     generate_ref_data,
-    convert_to_relay,
     compile_and_run,
     compile_models,
-    parametrize_aot_options,
     create_relay_module_and_inputs_from_tflite_file,
 )
+from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options
 
 
 def test_error_c_interface_with_packed_api():
diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py
index 650cb4526f097..60b46d96b555a 100644
--- a/tests/python/relay/aot/test_crt_aot_usmp.py
+++ b/tests/python/relay/aot/test_crt_aot_usmp.py
@@ -32,14 +32,13 @@
 from tvm.relay.backend import Executor, Runtime
 from tvm import WorkspaceMemoryPools, PoolInfo
 from tvm.micro import model_library_format as mlf
-from aot_test_utils import (
+from tvm.micro.testing.aot_test_utils import parametrize_aot_options
+from tvm.testing.aot import (
     AOTTestModel,
     AOTTestRunner,
     generate_ref_data,
-    convert_to_relay,
     compile_and_run,
     compile_models,
-    parametrize_aot_options,
     run_and_check,
     create_relay_module_and_inputs_from_tflite_file,
 )
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
new file mode 100644
index 0000000000000..e88210a59e772
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_nchw.py
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import (
+    AOT_CORSTONE300_RUNNER,
+)
+
+
+class BasicConv2dTests:
+    @tvm.testing.requires_corstone300
+    def test_conv2d(
+        self,
+        data_shape,
+        kernel_size,
+        kernel_layout,
+        num_filter,
+        strides,
+        padding,
+        dilation,
+        dtype,
+        schedule_name,
+    ):
+        """Test a subgraph with a single conv2d_nchw operator."""
+        ishape = data_shape
+        wshape = (num_filter, data_shape[1], *kernel_size)
+        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
+
+        input0 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight0 = relay.const(weight_data)
+        out0 = relay.op.nn.conv2d(
+            input0,
+            weight0,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            dilation=(dilation, dilation),
+            data_layout="NCHW",
+            kernel_layout="OIHW",
+            out_dtype="int32",
+            out_layout="NCHW",
+        )
+        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
+
+        input1 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight1 = relay.const(weight_data)
+
+        out1 = relay.op.nn.conv2d(
+            input1,
+            weight1,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            dilation=(dilation, dilation),
+            data_layout="NCHW",
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout="NCHW",
+        )
+        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
+
+        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
+        output_list = generate_ref_data(ref_mod, inputs)
+
+        compile_and_run(
+            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+            runner=AOT_CORSTONE300_RUNNER,
+            interface_api="c",
+            use_unpacked_api=True,
+            target_opts={
+                "-keys": "arm_cpu",
+                "-mcpu": "cortex-m7",
+            },
+            schedule_name=schedule_name,
+        )
+
+
+class TestConv2d_OIHW_small_kernel(BasicConv2dTests):
+    """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation, dtype = tvm.testing.parameters(
+        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int8"),
+        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1, "int16"),
+        ((1, 32, 16, 16), (3, 3), 12, 1, 0, 1, "int16"),
+    )
+    kernel_layout = tvm.testing.parameter("OIHW")
+    schedule_name = tvm.testing.parameter("conv2d_nchw_spatial_pack.arm_cpu")
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
new file mode 100644
index 0000000000000..f56645d43672a
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_nhwc.py
@@ -0,0 +1,154 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+
+
+class BasicConv2dTests:
+    @tvm.testing.requires_corstone300
+    def test_conv2d(
+        self,
+        data_shape,
+        kernel_size,
+        kernel_layout,
+        num_filter,
+        strides,
+        padding,
+        dilation,
+        dtype,
+        schedule_name,
+    ):
+        """Test a subgraph with a single conv2d operator."""
+        ishape = data_shape
+        wshape = (*kernel_size, data_shape[-1], num_filter)
+
+        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
+
+        input0 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight0 = relay.const(weight_data)
+        out0 = relay.op.nn.conv2d(
+            input0,
+            weight0,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            dilation=(dilation, dilation),
+            data_layout="NHWC",
+            kernel_layout="HWIO",
+            out_dtype="int32",
+            out_layout="NHWC",
+        )
+        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
+
+        input1 = relay.var("input", relay.TensorType(ishape, dtype))
+
+        if kernel_layout == "HWOI":
+            weight1 = relay.const(np.moveaxis(weight_data, 2, -1))
+        elif kernel_layout == "HWIO":
+            weight1 = relay.const(weight_data)
+
+        out1 = relay.op.nn.conv2d(
+            input1,
+            weight1,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            dilation=(dilation, dilation),
+            data_layout="NHWC",
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout="NHWC",
+        )
+        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
+
+        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
+        output_list = generate_ref_data(ref_mod, inputs)
+
+        compile_and_run(
+            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+            runner=AOT_CORSTONE300_RUNNER,
+            interface_api="c",
+            use_unpacked_api=True,
+            target_opts={
+                "-keys": "arm_cpu",
+                "-mcpu": "cortex-m7",
+            },
+            schedule_name=schedule_name,
+        )
+
+
+class TestConv2d_DSP_HWOI(BasicConv2dTests):
+    """This test is for conv2d_nhwc_dsp.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        # TODO(mehrdadh): Fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
+        # ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
+        # ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+        # from Keyword Spotting model from MLPerfTiny models
+        # TODO(mehrdadh): Fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1),
+        # from Visual Wake Word model from MLPerfTiny models
+        # TODO(mehrdadh): fails due to https://github.com/apache/tvm/issues/11216
+        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1),
+        # from Image Classification model from MLPerfTiny models
+        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1),
+        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1),
+        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1),
+        ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1),
+        ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1),
+        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1),
+        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1),
+    )
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("HWOI")
+    schedule_name = tvm.testing.parameter("conv2d_nhwc_dsp.arm_cpu")
+
+
+class TestConv2d_HWIO(BasicConv2dTests):
+    """This test is for conv2d_nhwc_spatial_pack.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
+        ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
+        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    )
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("HWIO")
+    schedule_name = tvm.testing.parameter("conv2d_nhwc_spatial_pack.arm_cpu")
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
new file mode 100644
index 0000000000000..89f1fb1843b4d
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+
+
+class BasicDepthwiseConv2dTests:
+    @tvm.testing.requires_corstone300
+    def test_conv2d(
+        self,
+        data_shape,
+        data_layout,
+        kernel_size,
+        kernel_layout,
+        num_filter,
+        strides,
+        padding,
+        dilation,
+        dtype,
+        schedule_name,
+    ):
+        """Test a subgraph with a single conv2d operator."""
+        ishape = data_shape
+        groups = num_filter
+
+        assert groups > 1, f"groups should be more than 1 to create a depthwise conv2d."
+
+        if data_layout == "NCHW" and kernel_layout == "OIHW":
+            assert (
+                num_filter == data_shape[1]
+            ), f"Output channels({num_filter}) should be equal to input channels({data_shape[1]})."
+            wshape = (num_filter, data_shape[1] // groups, *kernel_size)
+        elif data_layout == "NHWC" and kernel_layout == "HWOI":
+            assert (
+                num_filter == data_shape[3]
+            ), f"Output channels({num_filter}) should be equal to input channels({data_shape[3]})."
+            wshape = (*kernel_size, num_filter, data_shape[3] // groups)
+        else:
+            raise ValueError(
+                f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})."
+            )
+
+        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
+
+        input0 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight0 = relay.const(weight_data)
+        out0 = relay.op.nn.conv2d(
+            input0,
+            weight0,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout=data_layout,
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout=data_layout,
+        )
+        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
+
+        input1 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight1 = relay.const(weight_data)
+        out1 = relay.op.nn.conv2d(
+            input1,
+            weight1,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout=data_layout,
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout=data_layout,
+        )
+        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
+
+        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
+        output_list = generate_ref_data(ref_mod, inputs)
+
+        compile_and_run(
+            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+            runner=AOT_CORSTONE300_RUNNER,
+            interface_api="c",
+            use_unpacked_api=True,
+            target_opts={
+                "-keys": "arm_cpu",
+                "-mcpu": "cortex-m7",
+            },
+            schedule_name=schedule_name,
+        )
+
+
+class TestDepthwiseConv2d_NCHW_OIHW(BasicDepthwiseConv2dTests):
+    """This test is for depthwise_conv2d_nchw.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 10, 3), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 32, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 32, 1, (0, 2, 2, 0), 2),
+        ((1, 16, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    )
+    data_layout = tvm.testing.parameter("NCHW")
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("OIHW")
+    schedule_name = tvm.testing.parameter("depthwise_conv2d_nchw.arm_cpu")
+
+
+class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests):
+    """This test is for depthwise_conv2d_nhwc.generic schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 49, 10, 64), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    )
+    data_layout = tvm.testing.parameter("NHWC")
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("HWOI")
+    schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc.generic")
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py
new file mode 100644
index 0000000000000..d3f504d04e355
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py
@@ -0,0 +1,151 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+
+
+class BasicGroupConv2dTests:
+    @tvm.testing.requires_corstone300
+    def test_conv2d(
+        self,
+        data_shape,
+        data_layout,
+        kernel_size,
+        kernel_layout,
+        num_filter,
+        strides,
+        padding,
+        dilation,
+        groups,
+        dtype,
+        schedule_name,
+    ):
+        """Test a subgraph with a single conv2d operator."""
+        ishape = data_shape
+
+        assert groups > 1, f"groups should be more than 1 to create a group conv2d."
+
+        if data_layout == "NCHW" and kernel_layout == "OIHW":
+            assert data_shape[1] % groups == 0
+            wshape = (num_filter, data_shape[1] // groups, *kernel_size)
+        elif data_layout == "NHWC" and kernel_layout == "HWIO":
+            assert data_shape[3] % groups == 0
+            wshape = (*kernel_size, data_shape[3] // groups, num_filter)
+        else:
+            raise ValueError(
+                f"Incorrect data layout({data_layout}) and kernel layout({kernel_layout})."
+            )
+
+        weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)
+
+        input0 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight0 = relay.const(weight_data)
+        out0 = relay.op.nn.conv2d(
+            input0,
+            weight0,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout=data_layout,
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout=data_layout,
+        )
+        ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))
+
+        input1 = relay.var("input", relay.TensorType(ishape, dtype))
+        weight1 = relay.const(weight_data)
+        out1 = relay.op.nn.conv2d(
+            input1,
+            weight1,
+            kernel_size=kernel_size,
+            strides=strides,
+            padding=padding,
+            groups=groups,
+            dilation=(dilation, dilation),
+            data_layout=data_layout,
+            kernel_layout=kernel_layout,
+            out_dtype="int32",
+            out_layout=data_layout,
+        )
+        mod = tvm.IRModule.from_expr(relay.Function([input1], out1))
+
+        inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
+        output_list = generate_ref_data(ref_mod, inputs)
+
+        compile_and_run(
+            AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
+            runner=AOT_CORSTONE300_RUNNER,
+            interface_api="c",
+            use_unpacked_api=True,
+            target_opts={
+                "-keys": "arm_cpu",
+                "-mcpu": "cortex-m7",
+            },
+            schedule_name=schedule_name,
+        )
+
+
+class TestGroupConv2d_NCHW_OIHW(BasicGroupConv2dTests):
+    """This test is for group_conv2d_nchw.arm_cpu schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        ((1, 16, 32, 32), (3, 3), 12, 1, 0, 1),
+        ((1, 16, 32, 10), (3, 3), 16, 1, 0, 1),
+        ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
+        ((1, 16, 32, 32), (3, 3), 16, 1, 0, 1),
+        ((1, 16, 32, 32), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 16, 32, 32), (3, 3), 32, 1, (1, 1, 2, 2), 2),
+    )
+    groups = tvm.testing.parameter(2, 4)
+    data_layout = tvm.testing.parameter("NCHW")
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("OIHW")
+    schedule_name = tvm.testing.parameter("group_conv2d_nchw.arm_cpu")
+
+
+class TestGroupConv2d_NHWC_HWIO(BasicGroupConv2dTests):
+    """This test is for group_conv2d_nhwc.generic schedule."""
+
+    data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters(
+        ((1, 32, 32, 16), (3, 3), 12, 1, 0, 1),
+        ((1, 32, 10, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 49, 10, 16), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
+        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
+    )
+    groups = tvm.testing.parameter(2, 4)
+    data_layout = tvm.testing.parameter("NHWC")
+    dtype = tvm.testing.parameter("int8", "int16")
+    kernel_layout = tvm.testing.parameter("HWIO")
+    schedule_name = tvm.testing.parameter("group_conv2d_nhwc.generic")
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/relay/utils/external_codegen.py b/tests/python/relay/utils/external_codegen.py
index 4dbc8f274264d..6d3d917ff5a23 100644
--- a/tests/python/relay/utils/external_codegen.py
+++ b/tests/python/relay/utils/external_codegen.py
@@ -104,7 +104,8 @@ def check_aot_executor_result(
     mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", device=tvm.cpu()
 ):
     # Late import to avoid breaking test with USE_MICRO=OFF.
-    from aot.aot_test_utils import AOTTestModel, AOT_DEFAULT_RUNNER, compile_and_run
+    from tvm.testing.aot import AOTTestModel, compile_and_run
+    from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER
 
     interface_api = "packed"
     use_unpacked_api = False
diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py
index 3a93dbc89b1f5..d5611906fc5df 100644
--- a/tests/python/unittest/test_crt.py
+++ b/tests/python/unittest/test_crt.py
@@ -292,7 +292,7 @@ def test_platform_timer():
 def test_autotune():
     """Verify that autotune works with micro."""
     import tvm.relay as relay
-    from tvm.micro.testing import check_tune_log
+    from tvm.micro.testing.utils import check_tune_log
 
     runtime = Runtime("crt", {"system-lib": True})
 
diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh
index d13ee91a0ba87..7301c6f833abb 100755
--- a/tests/scripts/task_python_microtvm.sh
+++ b/tests/scripts/task_python_microtvm.sh
@@ -51,3 +51,5 @@ export TVM_MICRO_USE_HW=1
 export TVM_MICRO_BOARD=qemu_x86
 python3 gallery/how_to/work_with_microtvm/micro_tflite.py
 python3 gallery/how_to/work_with_microtvm/micro_autotune.py
+
+run_pytest ctypes python-relay-strategy-arm_cpu tests/python/relay/strategy/arm_cpu --enable-corstone300-tests

From e02bf824d11019413ed1f8eb78da2b3427b0f026 Mon Sep 17 00:00:00 2001
From: Hua Jiang <huaj@xilinx.com>
Date: Thu, 19 May 2022 16:51:13 -0700
Subject: [PATCH 38/59] [Runtime][PipelineExecutor] Add graph manually
 splitting logic into the unit test. (#11334)

* [Runtime][PipelineExecutor] Add a manual graph-splitting example to
the unit test.

The current unit test creates 3 separate modules and then re-connects them to
run the pipeline executor, which is not a realistic use case for the pipeline
executor.

Add manual graph-splitting logic that splits a full network into 3
subgraphs, then runs the pipeline executor and verifies the result to
simulate the real use case.

* address review comments

* trigger build.

* address review comments

* address review comments

* rebase and trigger build.
---
 tests/python/relay/test_pipeline_executor.py | 224 +++++++++++++++++--
 1 file changed, 201 insertions(+), 23 deletions(-)

diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
index b97966dde0c8b..541f3bba13da2 100644
--- a/tests/python/relay/test_pipeline_executor.py
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -22,12 +22,195 @@
 import tvm
 import tvm.testing
 from tvm import relay
-from tvm.relay import transform
+from tvm.relay import transform, build_module
+from tvm.relay.testing import run_opt_pass
 from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
 from tvm._ffi import get_global_func
 from tvm.contrib import cc as _cc
 
 
+def graph_split(expr, split_conf, params=None):
+    """Splitting the graph into a list of subgraphs"""
+
+    def get_dep_var(sub_var_dep):
+        return [var for var in sub_var_dep[len(sub_var_dep) - 1]["ref_nodes"]]
+
+    def parse_dependency(value, snode_dep, new_input_idx):
+        new_args = []
+        need_update = False
+        for var in value.args:
+            is_free_var = False
+            for dep in snode_dep[:-1]:
+                if var in dep["nodes"]:
+                    # Mark the previous subgraph node as a dependency.
+                    dep["nodes"][var] += 1
+                    dep["ref_nodes"][var] = dep["nodes"][var]
+                    # The var of this call is a free_var
+                    is_free_var = True
+            # if the var of this call is a free_var, recreate it and give it a fixed input name.
+            if is_free_var:
+                need_update = True
+                new_args.append(relay.var(f"data_n_{new_input_idx}", var.checked_type))
+                new_input_idx += 1
+            else:
+                new_args.append(var)
+        # if the 'tvm.relay.expr.Call' has a free_var, recreate it with new name as 'data_n_*'.
+        if need_update:
+            value = tvm.relay.expr.Call(
+                value.op, new_args, value.attrs, value.type_args, value.span
+            )
+        return value, snode_dep, new_input_idx
+
+    def merge_constant_expr(constant_expr, expr):
+        # Merge the constant expression with an expression.
+        if not isinstance(constant_expr.body, tvm.relay.expr.Let):
+            return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr)
+
+        return tvm.relay.expr.Let(
+            constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr)
+        )
+
+    def _recursion(anf, pipeline_mods, split_conf, constant_expr):
+        # Enumerate all operators of compute graph, then split the compute graph into a group of
+        # subgraphs.
+        nonlocal operator_index_map
+        nonlocal new_input_idx
+        nonlocal snode_dep
+        cur_node_dep = snode_dep[len(snode_dep) - 1]
+        if isinstance(anf, tvm.relay.Function):
+            return tvm.relay.Function(
+                anf.params,
+                _recursion(anf.body, pipeline_mods, split_conf, constant_expr),
+                anf.ret_type,
+                anf.type_params,
+                anf.attrs,
+            )
+        if isinstance(anf, tvm.relay.expr.Let):
+            value = anf.value
+            # record the constant expr to make sure all subgraphs can find the correct constant.
+            if isinstance(value, tvm.relay.expr.Constant):
+                if not constant_expr:
+                    constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var)
+                else:
+                    constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr)
+            if isinstance(value, tvm.relay.expr.Call):
+                new_args = []
+                # build current var list
+                cur_node_dep["nodes"][anf.var] = 0
+                # Get the dependency information of the nodes.
+                value, snode_dep, new_input_idx = parse_dependency(value, snode_dep, new_input_idx)
+                if isinstance(value.op, tvm.ir.Op):
+                    if value.op.name in operator_index_map:
+                        operator_index_map[value.op.name] += 1
+                    else:
+                        operator_index_map[value.op.name] = 0
+                    split_operator_name = split_conf[0]["op_name"] if split_conf else ""
+                    split_operator_index = split_conf[0]["op_index"] if split_conf else ""
+                    # if an operator name and repeating count in the network match with the values
+                    # of the 'split configuration', then this place is where we should do the
+                    # graph splitting.
+                    if (
+                        split_conf
+                        and split_operator_name in operator_index_map
+                        and operator_index_map[split_operator_name] >= split_operator_index
+                    ):
+                        # Do graph splitting.
+                        split_conf.pop(0)
+                        snode_dep.append({"nodes": {}, "ref_nodes": {}})
+                        ann = _recursion(
+                            anf.body,
+                            pipeline_mods,
+                            split_conf,
+                            constant_expr,
+                        )
+                        snode_dep.pop()
+                        dep_vars = get_dep_var(snode_dep)
+                        # When the nodes of the current subgraph are the dependency node of another
+                        # subgraph, we need to set them as the output of current subgraph.
+                        body = relay.Tuple(dep_vars) if len(dep_vars) > 1 else anf.var
+                        # when the operator of current subgraph uses previous subgraph constant
+                        # as the argument of a "relay.expr.call", such constant may become a free
+                        # variable if the constant does not exist in the current subgraph.
+                        # merge the previous constant with current subgraph to avoid such issue.
+                        if constant_expr:
+                            ann = merge_constant_expr(constant_expr, ann)
+                        ann = run_opt_pass(ann, transform.ToGraphNormalForm())
+                        mod = tvm.IRModule.from_expr(ann)
+                        pipeline_mods.insert(0, mod)
+                        # Return the last node of the current subgraph.
+                        return tvm.relay.expr.Let(anf.var, value, body)
+            return tvm.relay.expr.Let(
+                anf.var,
+                value,
+                _recursion(anf.body, pipeline_mods, split_conf, constant_expr),
+            )
+        else:
+            return anf
+
+    snode_dep = [{"nodes": {}, "ref_nodes": {}}]
+    pipeline_mods = []
+    operator_index_map = {}
+    # Used to track new inputs created by graph splitting.
+    new_input_idx = 0
+    constant_expr = None
+    subgraph_split_conf = split_conf.copy()
+    # Binding the parameters.
+    if params:
+        expr = build_module.bind_params_by_name(expr, params)
+    anf = run_opt_pass(expr, transform.ToANormalForm())
+    anf = run_opt_pass(anf, transform.InferType())
+    ann = _recursion(
+        anf,
+        pipeline_mods,
+        subgraph_split_conf,
+        constant_expr,
+    )
+    ann = run_opt_pass(ann.body, transform.ToGraphNormalForm())
+    mod = tvm.IRModule.from_expr(ann)
+    pipeline_mods.insert(0, mod)
+    return pipeline_mods
+
+
+def get_network():
+    # Build the full network as a single module; get_split_mod later splits it into subgraphs.
+    mods = []
+    dshape = (3, 3)
+    data = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    data21 = relay.var("data_1", relay.TensorType(dshape, "float32"))
+    data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32"))
+    data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    mvalue1 = np.full((1), 1).astype("float32")
+    mvalue2 = np.full((1), 2).astype("float32")
+    mvalue3 = np.full((1), 3).astype("float32")
+    mv1 = relay.Constant(tvm.nd.array(mvalue1))
+    mv2 = relay.Constant(tvm.nd.array(mvalue2))
+    mv3 = relay.Constant(tvm.nd.array(mvalue3))
+    # There are three outputs in the first model.
+    net1_output1 = relay.add(data, mv1)
+    net1_output2 = relay.subtract(data, mv2)
+    net1_output3 = relay.concatenate((net1_output1, net1_output2), axis=0)
+    (net1_output3, _) = relay.split(net1_output3, indices_or_sections=2, axis=0)
+    net1_output3 = relay.add(net1_output3, mv2)
+    # The second model uses the output named net1_output3 of the first model as the first input,
+    # the second input of the second model is data21.
+    net2 = relay.add(net1_output3, mv2)
+    net2 = relay.add(net2, data21)
+    net2_output = relay.add(net2, mv3)
+    # The third model uses the output named net2_output of the second model as the first input
+    # and uses the output named net1_output2 of the first model as the second input.
+    net3 = relay.multiply(net2_output, mv3)
+    net3 = relay.add(net3, net1_output2)
+    return tvm.IRModule.from_expr(relay.Function([data, data21], relay.Tuple([net3]))), dshape
+
+
+def get_split_mod():
+    mod, dshape = get_network()
+    split_conf = [{"op_name": "add", "op_index": 1}, {"op_name": "add", "op_index": 4}]
+    mods = graph_split(mod["main"], split_conf)
+    return mods, dshape
+
+
 def get_mannual_mod():
     # Get a list of modules representing subgraphs.
     mods = []
@@ -83,9 +266,8 @@ def get_manual_conf(mods, target):
         "mod_idx": 0,
         "cpu_affinity": "0",
         "output": [
-            {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_0"}]},
-            {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_0"}]},
-            {"output_idx": 2, "dependencies": [{"global_output_index": 0}]},
+            {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_n_0"}]},
+            {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_n_2"}]},
         ],
     }
     mod_config[mods[0]] = {
@@ -103,7 +285,7 @@ def get_manual_conf(mods, target):
         "mod_idx": 1,
         "cpu_affinity": "0",
         "output": [
-            {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_1"}]},
+            {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_n_1"}]},
         ],
     }
     mod_config[mods[1]] = {
@@ -120,7 +302,7 @@ def get_manual_conf(mods, target):
     pipe_config3 = {
         "mod_idx": 2,
         "cpu_affinity": "0",
-        "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 1}]}],
+        "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 0}]}],
     }
     mod_config[mods[2]] = {
         "pipeline": pipe_config3,
@@ -222,7 +404,7 @@ def test_pipe_runtime_error_check():
     # This function is used to trigger runtime error by applying wrong logic.
     if pipeline_executor_build.pipeline_executor_build_enabled():
         # Get three pipeline modules here.
-        (mod1, mod2, mod3), dshape = get_mannual_mod()
+        (mod1, mod2, mod3), dshape = get_split_mod()
 
         # The input or output name is illegal and expects a runtime error.
         pipe_error = pipeline_executor_build.PipelineConfig()
@@ -283,7 +465,7 @@ def test_pipeline():
         for target in target_list:
             affinity = os.sched_getaffinity(0)
             # Get the three pipeline modules here.
-            (mod1, mod2, mod3), dshape = get_mannual_mod()
+            (mod1, mod2, mod3), dshape = get_split_mod()
 
             # Prepare batch data for pipeline computation.
             datas = []
@@ -305,33 +487,29 @@ def test_pipeline():
             pipe_config["input"]["data_b"].connect(pipe_config[mod2]["input"]["data_1"])
 
             # The mod1 output[0] will be connected to a input named "data_0" of mod2.
-            pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_0"])
+            pipe_config[mod1]["output"][0].connect(pipe_config[mod2]["input"]["data_n_0"])
 
             # The mod1 output[1] will be connected to a input named "data_0" of mod3.
-            pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_0"])
+            pipe_config[mod1]["output"][1].connect(pipe_config[mod3]["input"]["data_n_2"])
 
             # The mod2 output[2] will be connected to a input named "data_1" of mod3.
-            pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_1"])
-
-            # The mod1 output[2] will be connected to pipeline output[0].
-            pipe_config[mod1]["output"][2].connect(pipe_config["output"]["0"])
+            pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_n_1"])
 
-            # The mod3 output[0] will be connected to pipeline output[1].
-            pipe_config[mod3]["output"][0].connect(pipe_config["output"]["1"])
-            # Print configueration (print(pipe_config)), the result looks like following.
+            # The mod3 output[0] will be connected to pipeline output[0].
+            pipe_config[mod3]["output"][0].connect(pipe_config["output"]["0"])
+            # Print configuration (print(pipe_config)), the result looks like following.
             #
             # Inputs
             #   |data_a: mod1:data_0
             #   |data_b: mod2:data_1
             #
             # output
-            #   |output(1) : mod1.output(2)
-            #   |output(2) : mod3.output(0)
+            #   |output(1) : mod3.output(0)
             #
             # connections
-            #   |mod1.output(0)-> mod2.data_0
-            #   |mod1.output(1)-> mod3.data_0
-            #   |mod2.output(0)-> mod3.data_1
+            #   |mod1.output(0)-> mod2.data_n_0
+            #   |mod1.output(1)-> mod3.data_n_2
+            #   |mod2.output(0)-> mod3.data_n_1
 
             # Set other parameters.
             pipe_config[mod1].target = target[0]
@@ -367,7 +545,7 @@ def test_pipeline():
 
             # Use the import function to create and initialize PipelineModule.
             pipeline_module_test = pipeline_executor.PipelineModule.load_library(config_file_name)
-            assert pipeline_module_test.num_outputs == 2
+            assert pipeline_module_test.num_outputs == 1
 
             input_map = pipeline_module_test.get_input_pipeline_map("data_b")
             assert input_map[0] == "1" and input_map[1] == "data_1"

From a6a34046c432b3766e7c32bbd85c098812a12a68 Mon Sep 17 00:00:00 2001
From: Jiawei Liu <jaway.liu@gmail.com>
Date: Thu, 19 May 2022 23:45:25 -0500
Subject: [PATCH 39/59] fix vec*mat in PyTorch converter (#11347)

* fix vec*mat in PyTorch converter

* Trigger CI
---
 python/tvm/relay/frontend/pytorch.py          | 2 ++
 tests/python/frontend/pytorch/test_forward.py | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index dc5938931ed04..3887b40141c7f 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -1698,6 +1698,8 @@ def matmul(self, inputs, input_types):
             return output
         elif len(a_shape) > 2:
             inputs_0 = _op.reshape(inputs_0, [-1, a_shape[-1]])
+        elif len(a_shape) == 1:
+            return _op.squeeze(_op.nn.matmul(_op.expand_dims(inputs_0, axis=0), inputs_1), axis=[0])
 
         if len(b_shape) > 2:
             trans_axes = list(range(len(b_shape)))
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 1abd59dce8113..642beb015fecd 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3511,6 +3511,11 @@ def forward(self, *args):
     tensor2 = torch.randn(4)
     verify_model(MatMul1().float().eval(), input_data=[tensor1, tensor2])
 
+    # vector x matrix
+    tensor1 = torch.randn(4)
+    tensor2 = torch.randn(4, 3)
+    verify_model(MatMul1().float().eval(), input_data=[tensor1, tensor2])
+
     # matrix x matrix
     tensor1 = torch.randn(10, 4)
     tensor2 = torch.randn(4, 10)

From 7e99d30d63a0c20eedc247c723e2318686b815cf Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Fri, 20 May 2022 17:50:32 +0900
Subject: [PATCH 40/59] [PTX] Intrinsics for async copy from global to shared
 (SM80) (#11368)

* register ptx builtin for async copy

* add basic codegen

* add test

* update codegen

* wip

* codegen bug fixed, test working

* add commit group

* add doc
---
 include/tvm/tir/builtin.h                     | 19 +++++
 src/target/source/codegen_cuda.cc             | 12 ++++
 src/target/source/ptx.cc                      | 26 +++++++
 src/target/source/ptx.h                       | 13 ++++
 src/tir/op/builtin.cc                         |  9 +++
 .../python/unittest/test_tir_ptx_cp_async.py  | 70 +++++++++++++++++++
 6 files changed, 149 insertions(+)
 create mode 100644 tests/python/unittest/test_tir_ptx_cp_async.py

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index b166b16b77219..f33432645cc3c 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -632,6 +632,25 @@ TVM_DLL const Op& ptx_mma_sp();
  */
 TVM_DLL const Op& ptx_ldmatrix();
 
+/*!
+ * \brief tvm intrinsics for ptx async copy from global to shared memory
+ *
+ * void ptx_cp_async(Var shared_ptr, Expr shared_offset, Var global_ptr, Expr global_offset, size_t
+ * bytes);
+ *
+ */
+TVM_DLL const Op& ptx_cp_async();
+
+/*!
+ * \brief tvm intrinsics for ptx async copy commit and wait.
+ *
+ * void ptx_commit_group();
+ * void ptx_wait_group(int num);
+ *
+ */
+TVM_DLL const Op& ptx_commit_group();
+TVM_DLL const Op& ptx_wait_group();
+
 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
 /*!
  * \brief Get the high level half of the vector
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index d4ec536fb0012..7459d4c250baa 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -821,6 +821,18 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     std::string smem_elem_offset = this->PrintExpr(op->args[6]);
     this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset,
                                             smem_ptr, smem_elem_offset);
+  } else if (op->op.same_as(builtin::ptx_cp_async())) {
+    std::string dst = this->PrintExpr(op->args[0]);
+    std::string dst_offset = this->PrintExpr(op->args[1]);
+    std::string src = this->PrintExpr(op->args[2]);
+    std::string src_offset = this->PrintExpr(op->args[3]);
+    std::string size = this->PrintExpr(op->args[4]);
+    this->stream << PrintCpAsyncAssembly(dst, dst_offset, src, src_offset, size);
+  } else if (op->op.same_as(builtin::ptx_commit_group())) {
+    this->stream << "__asm__ __volatile__(\"cp.async.commit_group;\");\n\n";
+  } else if (op->op.same_as(builtin::ptx_wait_group())) {
+    std::string N = this->PrintExpr(op->args[0]);
+    this->stream << "__asm__ __volatile__(\"cp.async.wait_group " + N + ";\");\n\n";
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc
index 02a98ffbbabd3..71c68baed6dcc 100644
--- a/src/target/source/ptx.cc
+++ b/src/target/source/ptx.cc
@@ -638,5 +638,31 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type
   return asm_code;
 }
 
+std::string PrintCpAsyncAssembly(const std::string& shared_ptr,
+                                 const std::string& shared_elem_offset,
+                                 const std::string& global_ptr,
+                                 const std::string& global_elem_offset, const std::string& bytes) {
+  std::string asm_code = R"(
+  {
+    unsigned int addr;
+    __asm__ __volatile__(
+      "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
+      : "=r"(addr)
+      : "l"((void *)({smem_addr}))
+    );
+    __asm__ __volatile__(
+      "cp.async.cg.shared.global [%0], [%1], %2;"
+       :: "r"(addr), "l"((void*)({global_ptr})), "n"({bytes})
+    );
+  }
+)";
+  Replacer replacer;
+  replacer.register_rule("{smem_addr}", shared_ptr + " + " + shared_elem_offset);
+  replacer.register_rule("{global_ptr}", global_ptr + " + " + global_elem_offset);
+  replacer.register_rule("{bytes}", bytes);
+  asm_code = replacer.rewrite(asm_code);
+  return asm_code;
+}
+
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/source/ptx.h b/src/target/source/ptx.h
index c4255d737ad05..c811a1b9c1d6b 100644
--- a/src/target/source/ptx.h
+++ b/src/target/source/ptx.h
@@ -79,6 +79,19 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type
                                     const std::string& smem_ptr,
                                     const std::string& smem_elem_offset);
 
+/*!
+ * \brief Print ptx cp.async assembly string given parameters.
+ * \param shared_ptr: The pointer to the destination shared memory.
+ * \param shared_elem_offset: The offset into the shared memory.
+ * \param global_ptr: The pointer to the global memory.
+ * \param global_elem_offset: The offset into the global memory.
+ * \param bytes: The number of bytes to copy; the emitted cp.async.cg form accepts only 16.
+ */
+std::string PrintCpAsyncAssembly(const std::string& shared_ptr,
+                                 const std::string& shared_elem_offset,
+                                 const std::string& global_ptr,
+                                 const std::string& global_elem_offset, const std::string& bytes);
+
 }  // namespace codegen
 }  // namespace tvm
 
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 4e8d83dd32df3..0415d1bbec9e8 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -247,6 +247,15 @@ TIR_DEFINE_BUILTIN_FUNC(ptx_mma_sp)
 TIR_DEFINE_BUILTIN_FUNC(ptx_ldmatrix)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_BUILTIN_FUNC(ptx_cp_async)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_BUILTIN_FUNC(ptx_commit_group)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_BUILTIN_FUNC(ptx_wait_group)
+    .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
+
 TIR_DEFINE_BUILTIN_FUNC(vectorhigh)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
 
diff --git a/tests/python/unittest/test_tir_ptx_cp_async.py b/tests/python/unittest/test_tir_ptx_cp_async.py
new file mode 100644
index 0000000000000..17b60885509f8
--- /dev/null
+++ b/tests/python/unittest/test_tir_ptx_cp_async.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm.script import tir as T
+import numpy as np
+import tvm.testing
+
+
+@T.prim_func
+def ptx_cp_async(A: T.Buffer[(32, 128), "float16"], B: T.Buffer[(32, 128), "float16"]) -> None:
+    T.func_attr({"global_symbol": "default_function", "tir.noalias": True})
+    bx = T.env_thread("blockIdx.x")
+    tx = T.env_thread("threadIdx.x")
+    T.launch_thread(bx, 1)
+    T.launch_thread(tx, 32)
+    with T.block():
+        A_shared = T.alloc_buffer([32, 128], "float16", scope="shared")
+        T.reads(A[0:32, 0:128])
+        T.writes(B[0:32, 0:128])
+
+        for i in range(16):
+            T.evaluate(
+                T.ptx_cp_async(
+                    A_shared.data, tx * 128 + 8 * i, A.data, tx * 128 + 8 * i, 16, dtype="float16"
+                )
+            )
+
+        # TODO(masahi): Remove dtype requirement from TVMScript parser
+        T.evaluate(T.ptx_commit_group(dtype="float16"))
+        T.evaluate(T.ptx_wait_group(0, dtype="float16"))
+
+        for i in range(128):
+            B[tx, i] = A_shared[tx, i]
+
+
+@tvm.testing.requires_cuda
+def test_ptx_cp_async():
+    f = ptx_cp_async
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    if major < 8:
+        # Require at least SM80
+        return
+
+    mod = tvm.build(f, target="cuda")
+    A_np = np.random.rand(32, 128).astype("float16")
+    B_np = np.zeros((32, 128)).astype("float16")
+    dev = tvm.cuda(0)
+    A_nd = tvm.nd.array(A_np, device=dev)
+    B_nd = tvm.nd.array(B_np, device=dev)
+    mod(A_nd, B_nd)
+    tvm.testing.assert_allclose(B_nd.numpy(), A_np)
+
+
+if __name__ == "__main__":
+    test_ptx_cp_async()

From 13272a19ef30b32c457a48b04dca72ed05aef784 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 20 May 2022 01:55:55 -0700
Subject: [PATCH 41/59] [ci] Disable flaky onnx tests (#11376)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 tests/python/frontend/onnx/test_forward.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py
index 6fac7f2f20aa6..d6f96f0d07960 100644
--- a/tests/python/frontend/onnx/test_forward.py
+++ b/tests/python/frontend/onnx/test_forward.py
@@ -6038,6 +6038,7 @@ def verify_qlinearmul(a_shape, b_shape, c_shape):
     verify_qlinearmul([5, 1, 7], [2, 7], [5, 2, 7])
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11375")
 @tvm.testing.parametrize_targets
 def test_qlinearleakyrelu(target, dev):
     def verify_qlinearleakyrelu(inshape, kwargs):
@@ -6063,6 +6064,7 @@ def verify_qlinearleakyrelu(inshape, kwargs):
     verify_qlinearleakyrelu([5, 1, 4, 6], {"alpha": 0.65})
 
 
+@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11375")
 @tvm.testing.parametrize_targets
 def test_qlinearsigmoid(target, dev):
     def verify_qlinearsigmoid(a_shape):

From 909851c2f5d66337a2897b6a9fb2b2f786bfa917 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 20 May 2022 01:56:23 -0700
Subject: [PATCH 42/59] [ci][easy] Fix parameters for macros (#11377)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile            | 4 ++--
 jenkins/Jenkinsfile.j2 | 2 +-
 jenkins/macros.j2      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 024b920ac676e..dbbbb29f79726 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-19T11:41:58.421857
+// Generated at 2022-05-19T14:04:32.815769
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -524,7 +524,7 @@ def cpp_unittest(image) {
 
 def add_microtvm_permissions() {
   sh(
-    script: 'find build/microtvm_template_projects -type f | xargs chmod +x',
+    script: 'find build/microtvm_template_projects -type f | grep qemu-hack | xargs chmod +x',
     label: 'Add execute permissions for microTVM files',
   )
 }
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 8742d07244857..9eac881c549a4 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -447,7 +447,7 @@ def cpp_unittest(image) {
 def add_microtvm_permissions() {
   {% for folder in microtvm_template_projects %}
   sh(
-    script: 'find {{ folder }} -type f | xargs chmod +x',
+    script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x',
     label: 'Add execute permissions for microTVM files',
   )
   {% endfor %}
diff --git a/jenkins/macros.j2 b/jenkins/macros.j2
index 2ce005a128efb..ce29aa2d580df 100644
--- a/jenkins/macros.j2
+++ b/jenkins/macros.j2
@@ -90,7 +90,7 @@
   },
 {% endmacro %}
 
-{% macro upload_artifacts(tag, filenames, folders=[]) %}
+{% macro upload_artifacts(tag, filenames, folders=None) %}
 sh(
             script: """
               set -eux

From 3248793dd8043e8fd68a4d2d104d61f1f0e71f61 Mon Sep 17 00:00:00 2001
From: Andrew Cheung <43327640+ninehusky@users.noreply.github.com>
Date: Fri, 20 May 2022 02:03:24 -0700
Subject: [PATCH 43/59] Add Conv3D bindings (#11381)

---
 rust/tvm/src/ir/relay/attrs/nn.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/rust/tvm/src/ir/relay/attrs/nn.rs b/rust/tvm/src/ir/relay/attrs/nn.rs
index c9d291113303d..040939d4f6c14 100644
--- a/rust/tvm/src/ir/relay/attrs/nn.rs
+++ b/rust/tvm/src/ir/relay/attrs/nn.rs
@@ -75,6 +75,25 @@ pub struct Conv2DAttrsNode {
     pub out_dtype: DataType,
 }
 
+#[repr(C)]
+#[derive(Object, Debug)]
+#[ref_name = "Conv3DAttrs"]
+#[type_key = "relay.attrs.Conv3DAttrs"]
+pub struct Conv3DAttrsNode {
+    pub base: BaseAttrsNode,
+    pub strides: Array<IndexExpr>,
+    pub padding: Array<IndexExpr>,
+    pub dilation: Array<IndexExpr>,
+    pub groups: i32,
+    pub channels: IndexExpr,
+    pub kernel_size: Array<IndexExpr>,
+    pub data_layout: TString,
+    pub kernel_layout: TString,
+    pub out_layout: TString,
+    pub auto_scheduler_rewritten_layout: TString,
+    pub out_dtype: DataType,
+}
+
 #[repr(C)]
 #[derive(Object, Debug)]
 #[ref_name = "BiasAddAttrs"]

From 07d91fa04182e77887b379c9644778c2a1a92999 Mon Sep 17 00:00:00 2001
From: Andrew Reusch <areusch@gmail.com>
Date: Fri, 20 May 2022 02:12:55 -0700
Subject: [PATCH 44/59] Fix function number datatype from char to uint16_t
 (#11365)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix function number datatype from char to uint16_t

rewrite the modified part to pass lint check

Use 2 bytes for func num in fun_registry

Fix errors in linter

Add the declaration of the helper functions

set 2 bytes for func num in func_registry test units

pass num_func by value

This commit changes the data type of the function count from 1 byte to 2 bytes.
It also adds helper functions to access the function count and the first function name.

* Fix aot_executor_module to unbreak CI.

* Fix GraphExecutorModule.

* Remove graph_json_to_c_func_registry.

 * No longer needed and not called anywhere.
 * Superseded by emitting the FuncRegistry directly in codegen.

Co-authored-by: 嚴中璟 <a1245967@gmail.com>
---
 include/tvm/runtime/crt/func_registry.h       | 27 ++++++-
 python/tvm/micro/func_registry.py             | 79 -------------------
 .../aot_executor_module/aot_executor_module.c |  2 +-
 src/runtime/crt/common/func_registry.c        | 39 ++++++---
 .../graph_executor_module.c                   |  2 +-
 src/target/func_registry_generator.cc         |  8 +-
 tests/crt/func_registry_test.cc               |  7 +-
 7 files changed, 68 insertions(+), 96 deletions(-)
 delete mode 100644 python/tvm/micro/func_registry.py

diff --git a/include/tvm/runtime/crt/func_registry.h b/include/tvm/runtime/crt/func_registry.h
index 4f8a19af591e8..50737f8717987 100644
--- a/include/tvm/runtime/crt/func_registry.h
+++ b/include/tvm/runtime/crt/func_registry.h
@@ -42,7 +42,7 @@ typedef struct TVMFuncRegistry {
   /*! \brief Names of registered functions, concatenated together and separated by \0.
    * An additional \0 is present at the end of the concatenated blob to mark the end.
    *
-   * Byte 0 is the number of functions in `funcs`.
+   * Bytes 0 and 1 hold the number of functions in `funcs`, stored as a uint16_t.
    */
   const char* names;
 
@@ -50,6 +50,31 @@ typedef struct TVMFuncRegistry {
   const TVMBackendPackedCFunc* funcs;
 } TVMFuncRegistry;
 
+/*!
+ * \brief Get the number of functions from the registry.
+ *
+ * \param reg TVMFuncRegistry instance to read the function count from.
+ * \return The number of functions in the registry.
+ */
+uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg);
+
+/*!
+ * \brief Set the number of functions stored in the registry.
+ *
+ * \param reg TVMFuncRegistry instance whose function count is overwritten.
+ * \param num_funcs The number of functions.
+ * \return 0 when successful.
+ */
+int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs);
+
+/*!
+ * \brief Get the address of the 0th function name in the registry.
+ *
+ * \param reg TVMFuncRegistry instance that holds the concatenated function names.
+ * \return The address of the 0th function name, located just after the 2-byte function count.
+ */
+const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg);
+
 /*!
  * \brief Get packed function from registry by name.
  *
diff --git a/python/tvm/micro/func_registry.py b/python/tvm/micro/func_registry.py
deleted file mode 100644
index 69c4bb1a29e50..0000000000000
--- a/python/tvm/micro/func_registry.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Defines functions to work with TVMModule FuncRegistry."""
-
-import json
-
-
-def graph_json_to_c_func_registry(graph_path, func_registry_path):
-    """Convert a graph json file to a CRT-compatible FuncRegistry.
-
-    Parameters
-    ----------
-    graph_path : str
-        Path to the graph JSON file.
-
-    func_registry_path : str
-        Path to a .c file which will be written containing the function registry.
-    """
-    with open(graph_path) as json_f:
-        graph = json.load(json_f)
-
-    funcs = []
-    for n in graph["nodes"]:
-        if n["op"] != "tvm_op":
-            continue
-
-        funcs.append(n["attrs"]["func_name"])
-
-    encoded_funcs = f"\\{len(funcs):03o}" + "\\0".join(funcs)
-    lines = [
-        "#include <tvm/runtime/c_runtime_api.h>",
-        "#include <tvm/runtime/crt/module.h>",
-        "#include <stdio.h>",
-        "",
-    ]
-
-    for f in funcs:
-        lines.append(
-            f"extern int {f}(TVMValue* args, int* type_codes, int num_args, "
-            "TVMValue* out_ret_value, int* out_ret_tcode, void* resource_handle);"
-        )
-
-    lines.append("static TVMBackendPackedCFunc funcs[] = {")
-
-    for f in funcs:
-        lines.append(f"    (TVMBackendPackedCFunc) &{f},")
-
-    lines += [
-        "};",
-        "static const TVMFuncRegistry system_lib_registry = {",
-        f'       "{encoded_funcs}\\0",',
-        "        funcs,",
-        "};",
-        "static const TVMModule system_lib = {",
-        "    &system_lib_registry,",
-        "};",
-        "",
-        "const TVMModule* TVMSystemLibEntryPoint(void) {",
-        "    return &system_lib;",
-        "}",
-        "",  # blank line to end the file
-    ]
-    with open(func_registry_path, "w") as wrapper_f:
-        wrapper_f.write("\n".join(lines))
diff --git a/src/runtime/crt/aot_executor_module/aot_executor_module.c b/src/runtime/crt/aot_executor_module/aot_executor_module.c
index d4b3755c13142..e1dbd533a3ecd 100644
--- a/src/runtime/crt/aot_executor_module/aot_executor_module.c
+++ b/src/runtime/crt/aot_executor_module/aot_executor_module.c
@@ -176,7 +176,7 @@ static const TVMBackendPackedCFunc aot_executor_registry_funcs[] = {
 };
 
 static const TVMFuncRegistry aot_executor_registry = {
-    "\x0aget_input\0"
+    "\x0a\0get_input\0"
     "get_input_index\0"
     "get_input_info\0"
     "get_num_inputs\0"
diff --git a/src/runtime/crt/common/func_registry.c b/src/runtime/crt/common/func_registry.c
index 116a5c496f1bd..49cef8fd70eb3 100644
--- a/src/runtime/crt/common/func_registry.c
+++ b/src/runtime/crt/common/func_registry.c
@@ -60,14 +60,29 @@ int strcmp_cursor(const char** cursor, const char* name) {
   return return_value;
 }
 
+uint16_t TVMFuncRegistry_GetNumFuncs(const TVMFuncRegistry* reg) {
+  uint16_t num_funcs;
+  memcpy(&num_funcs, reg->names, sizeof(num_funcs));
+  return num_funcs;
+}
+
+int TVMFuncRegistry_SetNumFuncs(const TVMFuncRegistry* reg, const uint16_t num_funcs) {
+  memcpy((char*)reg->names, &num_funcs, sizeof(num_funcs));
+  return 0;
+}
+
+const char* TVMFuncRegistry_Get0thFunctionName(const TVMFuncRegistry* reg) {
+  // NOTE: first function name starts at index 2 to skip num_funcs.
+  return (reg->names + sizeof(uint16_t));
+}
+
 tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* name,
                                        tvm_function_index_t* function_index) {
   tvm_function_index_t idx;
-  const char* reg_name_ptr;
+  const char* reg_name_ptr = TVMFuncRegistry_Get0thFunctionName(reg);
 
   idx = 0;
-  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
-  for (reg_name_ptr = reg->names + 1; *reg_name_ptr != '\0'; reg_name_ptr++) {
+  for (; *reg_name_ptr != '\0'; reg_name_ptr++) {
     if (!strcmp_cursor(&reg_name_ptr, name)) {
       *function_index = idx;
       return kTvmErrorNoError;
@@ -82,9 +97,9 @@ tvm_crt_error_t TVMFuncRegistry_Lookup(const TVMFuncRegistry* reg, const char* n
 tvm_crt_error_t TVMFuncRegistry_GetByIndex(const TVMFuncRegistry* reg,
                                            tvm_function_index_t function_index,
                                            TVMBackendPackedCFunc* out_func) {
-  uint8_t num_funcs;
+  uint16_t num_funcs;
 
-  num_funcs = reg->names[0];
+  num_funcs = TVMFuncRegistry_GetNumFuncs(reg);
   if (function_index >= num_funcs) {
     return kTvmErrorFunctionIndexInvalid;
   }
@@ -101,7 +116,8 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 
   reg->registry.names = (const char*)buffer;
   buffer[0] = 0;  // number of functions present in buffer.
-  buffer[1] = 0;  // end of names list marker.
+  buffer[1] = 0;  // note that we combine the first two elements to form a 16-bit function count.
+  buffer[2] = 0;  // end of names list marker.
 
   // compute a guess of the average size of one entry:
   //  - assume average function name is around ~10 bytes
@@ -117,13 +133,12 @@ tvm_crt_error_t TVMMutableFuncRegistry_Create(TVMMutableFuncRegistry* reg, uint8
 tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const char* name,
                                            TVMBackendPackedCFunc func, int override) {
   size_t idx;
-  char* reg_name_ptr;
+  char* reg_name_ptr = (char*)TVMFuncRegistry_Get0thFunctionName(&(reg->registry));
 
   idx = 0;
   // NOTE: safe to discard const qualifier here, since reg->registry.names was set from
   // TVMMutableFuncRegistry_Create above.
-  // NOTE: reg_name_ptr starts at index 1 to skip num_funcs.
-  for (reg_name_ptr = (char*)reg->registry.names + 1; *reg_name_ptr != 0; reg_name_ptr++) {
+  for (; *reg_name_ptr != 0; reg_name_ptr++) {
     if (!strcmp_cursor((const char**)&reg_name_ptr, name)) {
       if (override == 0) {
         return kTvmErrorFunctionAlreadyDefined;
@@ -149,7 +164,11 @@ tvm_crt_error_t TVMMutableFuncRegistry_Set(TVMMutableFuncRegistry* reg, const ch
   reg_name_ptr += name_len + 1;
   *reg_name_ptr = 0;
   ((TVMBackendPackedCFunc*)reg->registry.funcs)[idx] = func;
-  ((char*)reg->registry.names)[0]++;  // increment num_funcs.
+
+  uint16_t num_funcs;
+  // increment num_funcs.
+  num_funcs = TVMFuncRegistry_GetNumFuncs(&(reg->registry)) + 1;
+  TVMFuncRegistry_SetNumFuncs(&(reg->registry), num_funcs);
 
   return kTvmErrorNoError;
 }
diff --git a/src/runtime/crt/graph_executor_module/graph_executor_module.c b/src/runtime/crt/graph_executor_module/graph_executor_module.c
index 280130a994140..0ae12f5a9e0a1 100644
--- a/src/runtime/crt/graph_executor_module/graph_executor_module.c
+++ b/src/runtime/crt/graph_executor_module/graph_executor_module.c
@@ -229,7 +229,7 @@ static const TVMBackendPackedCFunc graph_executor_registry_funcs[] = {
 };
 
 static const TVMFuncRegistry graph_executor_registry = {
-    "\x08get_input\0"
+    "\x08\0get_input\0"
     "get_input_index\0"
     "get_input_info\0"
     "get_num_inputs\0"
diff --git a/src/target/func_registry_generator.cc b/src/target/func_registry_generator.cc
index 7c948d50cbb94..d679bf379b628 100644
--- a/src/target/func_registry_generator.cc
+++ b/src/target/func_registry_generator.cc
@@ -31,7 +31,13 @@ namespace target {
 
 std::string GenerateFuncRegistryNames(const Array<String>& function_names) {
   std::stringstream ss;
-  ss << (unsigned char)(function_names.size());
+
+  unsigned char function_nums[sizeof(uint16_t)];
+  *reinterpret_cast<uint16_t*>(function_nums) = function_names.size();
+  for (auto f : function_nums) {
+    ss << f;
+  }
+
   for (auto f : function_names) {
     ss << f << '\0';
   }
diff --git a/tests/crt/func_registry_test.cc b/tests/crt/func_registry_test.cc
index 9f0e7f8d1a5aa..5962a3acee39f 100644
--- a/tests/crt/func_registry_test.cc
+++ b/tests/crt/func_registry_test.cc
@@ -82,7 +82,7 @@ TEST(StrCmpScan, Test) {
 }
 
 TEST(FuncRegistry, Empty) {
-  TVMFuncRegistry registry{"\000", NULL};
+  TVMFuncRegistry registry{"\000\000", NULL};
 
   EXPECT_EQ(kTvmErrorFunctionNameNotFound, TVMFuncRegistry_Lookup(&registry, "foo", NULL));
   EXPECT_EQ(kTvmErrorFunctionIndexInvalid,
@@ -101,7 +101,7 @@ static int Bar(TVMValue* args, int* type_codes, int num_args, TVMValue* out_ret_
 }
 
 // Matches the style of registry defined in generated C modules.
-const char* kBasicFuncNames = "\002Foo\0Bar\0";  // NOTE: final \0
+const char* kBasicFuncNames = "\002\000Foo\0Bar\0";  // NOTE: final \0
 const TVMBackendPackedCFunc funcs[2] = {&Foo, &Bar};
 const TVMFuncRegistry kConstRegistry = {kBasicFuncNames, (const TVMBackendPackedCFunc*)funcs};
 
@@ -111,7 +111,8 @@ TEST(FuncRegistry, ConstGlobalRegistry) {
 
   // Foo
   EXPECT_EQ(kBasicFuncNames[0], 2);
-  EXPECT_EQ(kBasicFuncNames[1], 'F');
+  EXPECT_EQ(kBasicFuncNames[1], 0);
+  EXPECT_EQ(kBasicFuncNames[2], 'F');
   EXPECT_EQ(kTvmErrorNoError, TVMFuncRegistry_Lookup(&kConstRegistry, "Foo", &func_index));
   EXPECT_EQ(0, func_index);
 

From c8d22837055d97b2a06b585f0ae2ac5e8269a11d Mon Sep 17 00:00:00 2001
From: xndcn <xndchn@gmail.com>
Date: Fri, 20 May 2022 17:13:38 +0800
Subject: [PATCH 45/59] Fix array pointers releasing with `delete` operator
 (#11328)

It may be safe to release POD-types array with `delete`
operator, but `delete[]` is always better.
---
 src/contrib/tf_op/tvm_dso_op_kernels.cc | 2 +-
 src/target/metadata.h                   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/contrib/tf_op/tvm_dso_op_kernels.cc b/src/contrib/tf_op/tvm_dso_op_kernels.cc
index fb483ee6f2e05..78c10e4822c8d 100644
--- a/src/contrib/tf_op/tvm_dso_op_kernels.cc
+++ b/src/contrib/tf_op/tvm_dso_op_kernels.cc
@@ -207,7 +207,7 @@ class TVMDSOOpTrait<GPUDevice> {
     tensorflow::int64* dims = new tensorflow::int64[num_dims];
     cudaMemcpy(dims, flat, sizeof(tensorflow::int64) * num_dims, cudaMemcpyDeviceToHost);
     tensorflow::TensorShapeUtils::MakeShape(dims, num_dims, output_shape);
-    delete dims;
+    delete[] dims;
   }
 };
 #endif
diff --git a/src/target/metadata.h b/src/target/metadata.h
index 5dc1c9d0eec5b..426e8616070ae 100644
--- a/src/target/metadata.h
+++ b/src/target/metadata.h
@@ -134,11 +134,11 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo
   }
 
  private:
-  ::std::unique_ptr<struct TVMTensorInfo> inputs_;
+  ::std::unique_ptr<struct TVMTensorInfo[]> inputs_;
   std::vector<::tvm::runtime::metadata::TensorInfo> inputs_objs_;
-  ::std::unique_ptr<struct TVMTensorInfo> outputs_;
+  ::std::unique_ptr<struct TVMTensorInfo[]> outputs_;
   std::vector<::tvm::runtime::metadata::TensorInfo> outputs_objs_;
-  ::std::unique_ptr<struct TVMTensorInfo> pools_;
+  ::std::unique_ptr<struct TVMTensorInfo[]> pools_;
   std::vector<::tvm::runtime::metadata::TensorInfo> pools_objs_;
   ::std::string mod_name_;
   struct ::TVMMetadata storage_;
@@ -186,7 +186,7 @@ class InMemoryTensorInfoNode : public ::tvm::target::metadata::VisitableTensorIn
 
  private:
   ::std::string name_;
-  ::std::unique_ptr<int64_t> shape_;
+  ::std::unique_ptr<int64_t[]> shape_;
   struct ::TVMTensorInfo storage_;
 };
 

From c216cbec5bb795a8b13bdb1e177b523e4f7e4ca8 Mon Sep 17 00:00:00 2001
From: ChunPing Chung <cpchung@pllab.cs.nthu.edu.tw>
Date: Fri, 20 May 2022 17:14:25 +0800
Subject: [PATCH 46/59] [Bugfix] Fix qnn.quantize type func with incomplete
 type (#11124)

---
 src/relay/qnn/op/quantize.cc | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc
index 06a73ee91cbf5..da33aaac81873 100644
--- a/src/relay/qnn/op/quantize.cc
+++ b/src/relay/qnn/op/quantize.cc
@@ -55,8 +55,23 @@ bool QuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
 
   // If zero point and scale are scalar then axis doesnt matter.
-  bool scale_is_scalar = (types[1].as<TensorTypeNode>())->shape.size() == 0;
-  bool zp_is_scalar = (types[2].as<TensorTypeNode>())->shape.size() == 0;
+  bool scale_is_scalar, zp_is_scalar;
+
+  if (auto ttype = types[1].as<TensorTypeNode>()) {
+    scale_is_scalar = ttype->shape.size() == 0;
+  } else {
+    ICHECK(types[1].as<IncompleteTypeNode>())
+        << "Quantize: expect to be TensorType but get " << types[1];
+    return false;
+  }
+
+  if (auto ttype = types[2].as<TensorTypeNode>()) {
+    zp_is_scalar = ttype->shape.size() == 0;
+  } else {
+    ICHECK(types[2].as<IncompleteTypeNode>())
+        << "Quantize: expect to be TensorType but get " << types[2];
+    return false;
+  }
 
   if (!(scale_is_scalar && zp_is_scalar)) {
     ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range";

From 01b472f4d05584a669dfe2d7378fdaeeb76be378 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Sat, 21 May 2022 01:25:04 +0900
Subject: [PATCH 47/59] [CI] Update CPU and GPU image (#11369)

---
 Jenkinsfile            | 6 +++---
 jenkins/Jenkinsfile.j2 | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index dbbbb29f79726..7b8c8f890db15 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,13 +45,13 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-19T14:04:32.815769
+// Generated at 2022-05-20T18:06:10.772162
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
 ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e'
-ci_gpu = 'tlcpack/ci-gpu:20220513-055910-fa834f67e'
-ci_cpu = 'tlcpack/ci-cpu:20220517-094028-de21c8f2e'
+ci_gpu = 'tlcpack/ci-gpu:20220519-055908-ddfa1da69'
+ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69'
 ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e'
 ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e'
 ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e'
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 9eac881c549a4..b00ee0272626c 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -52,8 +52,8 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
 ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e'
-ci_gpu = 'tlcpack/ci-gpu:20220513-055910-fa834f67e'
-ci_cpu = 'tlcpack/ci-cpu:20220517-094028-de21c8f2e'
+ci_gpu = 'tlcpack/ci-gpu:20220519-055908-ddfa1da69'
+ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69'
 ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e'
 ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e'
 ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e'

From 72a5219aad7c9b807169f74f8954580a36c1d85e Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 20 May 2022 12:20:12 -0500
Subject: [PATCH 48/59] [Schedule] Allowed typing.Tuple in
 tir.schedule._type_checker (#11289)

* [Schedule] Allowed typing.Tuple in tir.schedule._type_checker

Previously, `typing.Tuple` annotations could not be used with
`tir.schedule._type_checker.type_checked` annotations.  This allows
`Tuple` type annotations to be type-checked.

* Revert change, allow tuples input as List arguments

* Suppress mypy errors

Directly interacting with a type object would otherwise cause some
false positives.

* Corrected unit test for allowing tuples to be used as typing.List

* Represent multi-type lists as List[Union[...]] instead of List[Any]

This gives a better error message and plays nicely with _type2str,
since `typing.Any` doesn't have a `__name__` field.
---
 python/tvm/tir/schedule/_type_checker.py      |  49 ++++++-
 .../unittest/test_type_annotation_checker.py  | 121 ++++++++++++++++++
 2 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 tests/python/unittest/test_type_annotation_checker.py

diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py
index 1b86c4aa30db2..21ca0c5a922b7 100644
--- a/python/tvm/tir/schedule/_type_checker.py
+++ b/python/tvm/tir/schedule/_type_checker.py
@@ -41,6 +41,13 @@ def list_(type_: Any) -> Any:
                 return [subtype]
             return None
 
+        @staticmethod
+        def tuple_(type_: Any) -> Optional[List[type]]:
+            if _Subtype._origin(type_) is tuple:
+                subtypes = type_.__args__
+                return subtypes
+            return None
+
         @staticmethod
         def optional(type_: Any) -> Optional[List[type]]:
             if _Subtype._origin(type_) is Union:
@@ -68,6 +75,14 @@ def list_(type_: Any) -> Optional[List[type]]:
                     return [subtype]
             return None
 
+        @staticmethod
+        def tuple_(type_: Any) -> Optional[List[type]]:
+            if isinstance(type_, typing.GenericMeta):  # type: ignore # pylint: disable=no-member
+                if type_.__name__ == "Tuple":
+                    subtypes = type_.__args__  # type: ignore # pylint: disable=no-member
+                    return subtypes
+            return None
+
         @staticmethod
         def optional(type_: Any) -> Optional[List[type]]:
             if isinstance(type_, typing._Union):  # type: ignore # pylint: disable=no-member,protected-access
@@ -93,6 +108,10 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]:
     if subtype is not None:
         return "list", subtype
 
+    subtype = _Subtype.tuple_(type_)
+    if subtype is not None:
+        return "tuple", subtype
+
     subtype = _Subtype.optional(type_)
     if subtype is not None:
         return "optional", subtype
@@ -108,6 +127,7 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]:
     "none": lambda: "None",
     "atomic": lambda t: str(t.__name__),
     "list": lambda t: f"List[{_type2str(t)}]",
+    "tuple": lambda *t: f"Tuple[{', '.join([_type2str(x) for x in t])}]",
     "optional": lambda t: f"Optional[{_type2str(t)}]",
     "union": lambda *t: f"Union[{', '.join([_type2str(x) for x in t])}]",
 }
@@ -118,11 +138,26 @@ def _type2str(type_: Any) -> str:
     return _TYPE2STR[key](*subtypes)
 
 
+def _val2type(value: Any):
+    if isinstance(value, list):
+        types = set(_val2type(x) for x in value)
+        if len(types) == 1:
+            return List[types.pop()]  # type: ignore
+
+        return List[Union[tuple(types)]]  # type: ignore
+
+    if isinstance(value, tuple):
+        types = tuple(_val2type(x) for x in value)  # type: ignore
+        return Tuple[types]
+
+    return type(value)
+
+
 def _type_check_err(x: Any, name: str, expected: Any) -> str:
     return (
         f'"{name}" has wrong type. '
         f'Expected "{_type2str(expected)}", '
-        f'but gets: "{_type2str(type(x))}"'
+        f'but gets: "{_type2str(_val2type(x))}"'
     )
 
 
@@ -142,6 +177,17 @@ def _type_check_list(v: List[Any], name: str, type_: Any) -> Optional[str]:
                 return error_msg
         return None
 
+    def _type_check_tuple(v: Any, name: str, *types: Any) -> Optional[str]:
+        if not isinstance(v, tuple):
+            return _type_check_err(v, name, Tuple[types])
+        if len(types) != len(v):
+            return _type_check_err(v, name, Tuple[types])
+        for i, (x, type_) in enumerate(zip(v, types)):
+            error_msg = _type_check(x, f"{name}[{i}]", type_)
+            if error_msg is not None:
+                return error_msg
+        return None
+
     def _type_check_optional(v: Any, name: str, type_: Any) -> Optional[str]:
         return None if v is None else _type_check(v, name, type_)
 
@@ -156,6 +202,7 @@ def _type_check_union(v: Any, name: str, *types: Any) -> Optional[str]:
         "none": _type_check_none,
         "atomic": _type_check_atomic,
         "list": _type_check_list,
+        "tuple": _type_check_tuple,
         "optional": _type_check_optional,
         "union": _type_check_union,
     }
diff --git a/tests/python/unittest/test_type_annotation_checker.py b/tests/python/unittest/test_type_annotation_checker.py
new file mode 100644
index 0000000000000..7317e05b1a755
--- /dev/null
+++ b/tests/python/unittest/test_type_annotation_checker.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test type checker based on python's type annotations"""
+
+from typing import List, Tuple
+
+import pytest
+
+from tvm.tir.schedule._type_checker import type_checked
+
+
+test_cases = [
+    {
+        "type_annotation": int,
+        "positive_cases": [5],
+        "negative_cases": ["5"],
+    },
+    {
+        "type_annotation": List[int],
+        "positive_cases": [
+            [5],
+            [],
+            # Tuples are allowed to be used as lists, because both are
+            # represented in FFI as tvm::runtime::Array.
+            (1, 2, 3),
+        ],
+        "negative_cases": [
+            None,
+            5,
+            ["5"],
+        ],
+    },
+    {
+        "type_annotation": Tuple[int],
+        "positive_cases": [
+            (5,),
+        ],
+        "negative_cases": [
+            None,
+            (1, 2, 3),
+            [1],
+            5,
+            ["5"],
+        ],
+    },
+    {
+        "type_annotation": Tuple[str, int],
+        "positive_cases": [
+            ("x", 5),
+        ],
+        "negative_cases": [
+            42,
+            ("x", 5, 6),
+            ("x", 5, "y"),
+            ("x", 5.0),
+            (None, 5),
+        ],
+    },
+]
+
+positive_cases = [
+    (config["type_annotation"], case) for config in test_cases for case in config["positive_cases"]
+]
+
+negative_cases = [
+    (config["type_annotation"], case) for config in test_cases for case in config["negative_cases"]
+]
+
+
+def format_name(type_annotation, case):
+    try:
+        name = type_annotation.__name__
+    except AttributeError:
+        name = str(type_annotation).replace("typing.", "")
+
+    return f"{name}_{case}"
+
+
+@pytest.mark.parametrize(
+    ["type_annotation", "case"],
+    positive_cases,
+    ids=[format_name(t, c) for t, c in positive_cases],
+)
+def test_matches_type(type_annotation, case):
+    @type_checked
+    def func(_: type_annotation):
+        pass
+
+    func(case)
+
+
+@pytest.mark.parametrize(
+    ["type_annotation", "case"],
+    negative_cases,
+    ids=[format_name(t, c) for t, c in negative_cases],
+)
+def test_not_matches(type_annotation, case):
+    @type_checked
+    def func(_: type_annotation):
+        pass
+
+    with pytest.raises(TypeError):
+        func(case)
+
+
+if __name__ == "__main__":  # allow running this test file directly as a script
+    raise SystemExit(pytest.main([__file__]))  # `sys` is not imported in this file

From febae407edc0dbc0add23474fb36c29b618f3b4e Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 20 May 2022 11:20:35 -0700
Subject: [PATCH 49/59] [docs] Add lightweight docs image (#11045)

* [docs] Add lightweight docs image

This image includes everything necessary to build the docs without any tutorials and is just about 1.5 GB which is significantly less than the CPU/GPU images.

* remove ci.py docs --cpu flag, imply it via a lack of --tutorials/--full so it is the default

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 docker/Dockerfile.docs | 77 ++++++++++++++++++++++++++++++++++++++++++
 docker/build.sh        |  7 +++-
 tests/scripts/ci.py    | 33 ++++++------------
 3 files changed, 93 insertions(+), 24 deletions(-)
 create mode 100644 docker/Dockerfile.docs

diff --git a/docker/Dockerfile.docs b/docker/Dockerfile.docs
new file mode 100644
index 0000000000000..840094b4d0cbc
--- /dev/null
+++ b/docker/Dockerfile.docs
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+FROM ubuntu:18.04
+
+# Base scripts
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh
+RUN bash /install/ubuntu1804_install_python.sh
+
+# Globally disable pip cache
+RUN pip config set global.no-cache-dir false
+
+COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
+RUN bash /install/ubuntu_install_python_package.sh
+
+COPY install/ubuntu_install_sphinx.sh /install/ubuntu_install_sphinx.sh
+RUN bash /install/ubuntu_install_sphinx.sh
+
+# Enable doxygen for c++ doc build
+RUN apt-get update && apt-get install -y doxygen libprotobuf-dev protobuf-compiler
+
+COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
+RUN bash /install/ubuntu_install_java.sh
+
+COPY install/ubuntu_install_nodejs.sh /install/ubuntu_install_nodejs.sh
+RUN bash /install/ubuntu_install_nodejs.sh
+
+# Rust env (build early; takes a while)
+COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
+RUN bash /install/ubuntu_install_rust.sh
+ENV RUSTUP_HOME /opt/rust
+ENV CARGO_HOME /opt/rust
+ENV PATH $PATH:$CARGO_HOME/bin
+
+# sccache
+COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
+RUN bash /install/ubuntu_install_sccache.sh
+
+RUN rm -rf /opt/rust \
+    /usr/lib/x86_64-linux-gnu/libopenblas* \
+    /usr/lib/jvm/java-11* \
+    /usr/lib/x86_64-linux-gnu/libLLVM-6.0.so.1
+
+# Environment variables
+ENV PATH=/usr/local/nvidia/bin:${PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
+ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
+ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH}
+
+# Ensure the local libcuda has higher priority than the one in /usr/local/cuda/compat,
+# since the compat libcuda does not work on non-Tesla GPUs
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:${LD_LIBRARY_PATH}
+
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}
+ENV PATH=/node_modules/.bin:${PATH}
+ENV VULKAN_SDK=/usr
diff --git a/docker/build.sh b/docker/build.sh
index ed67b638c79b3..75f0e35c6c7ba 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -22,7 +22,7 @@
 #
 # Usage: build.sh <CONTAINER_TYPE> [--tag <DOCKER_IMAGE_TAG>]
 #                [--dockerfile <DOCKERFILE_PATH>] [-it]
-#                [--net=host] [--cache-from <IMAGE_NAME>]
+#                [--net=host] [--cache-from <IMAGE_NAME>] [--cache]
 #                [--name CONTAINER_NAME] [--context-path <CONTEXT_PATH>]
 #                [--spec DOCKER_IMAGE_SPEC]
 #                [<COMMAND>]
@@ -99,6 +99,11 @@ if [[ "$1" == "--cache-from" ]]; then
     shift 1
 fi
 
+if [[ "$1" == "--cache" ]]; then
+    shift 1
+    DOCKER_NO_CACHE_ARG=
+fi
+
 if [[ "$1" == "--context-path" ]]; then
     DOCKER_CONTEXT_PATH="$2"
     echo "Using custom context path: ${DOCKER_CONTEXT_PATH}"
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index d45c3b1ae9cb2..b2b903ad01b1e 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -141,14 +141,6 @@ def check_gpu():
         )
 
 
-def check_build():
-    if (REPO_ROOT / "build").exists():
-        warnings.append(
-            "Existing build dir found may be interfering with the Docker "
-            "build (you may need to remove it)"
-        )
-
-
 def gen_name(s: str) -> str:
     # random 4 letters
     suffix = "".join([random.choice(string.ascii_lowercase) for i in range(5)])
@@ -227,38 +219,33 @@ def docker(name: str, image: str, scripts: List[str], env: Dict[str, str], inter
 def docs(
     tutorial_pattern: Optional[str] = None,
     full: bool = False,
-    cpu: bool = False,
     interactive: bool = False,
     skip_build: bool = False,
     docker_image: Optional[str] = None,
 ) -> None:
     """
     Build the documentation from gallery/ and docs/. By default this builds only
-    the Python docs.
+    the Python docs without any tutorials.
 
     arguments:
-    full -- Build all language docs, not just Python
-    precheck -- Run Sphinx precheck script
-    tutorial-pattern -- Regex for which tutorials to execute when building docs (can also be set via TVM_TUTORIAL_EXEC_PATTERN)
-    cpu -- Run with the ci-cpu image and use CMake defaults for building TVM (if no GPUs are available)
+    full -- Build all language docs, not just Python (this will use the 'ci_gpu' Docker image)
+    tutorial-pattern -- Regex for which tutorials to execute when building docs (this will use the 'ci_gpu' Docker image)
     skip_build -- skip build and setup scripts
     interactive -- start a shell after running build / test scripts
     docker-image -- manually specify the docker image to use
     """
-    config = "./tests/scripts/task_config_build_gpu.sh"
     build_dir = get_build_dir("gpu")
-    if cpu and full:
-        clean_exit("--full cannot be used with --cpu")
 
     extra_setup = []
     image = "ci_gpu" if docker_image is None else docker_image
-    if cpu:
+    if not full and tutorial_pattern is None:
+        # TODO: Change this to tlcpack/docs once that is uploaded
         image = "ci_cpu" if docker_image is None else docker_image
         build_dir = get_build_dir("cpu")
-        config = " && ".join(
+        config_script = " && ".join(
             [
-                "mkdir -p build",
-                "pushd build",
+                f"mkdir -p {build_dir}",
+                f"pushd {build_dir}",
                 "cp ../cmake/config.cmake .",
                 # The docs import tvm.micro, so it has to be enabled in the build
                 "echo set\(USE_MICRO ON\) >> config.cmake",
@@ -287,9 +274,10 @@ def docs(
         ]
     else:
         check_gpu()
+        config_script = f"./tests/scripts/task_config_build_gpu.sh {build_dir}"
 
     scripts = extra_setup + [
-        config + f" {build_dir}",
+        config_script,
         f"./tests/scripts/task_build.py --build-dir {build_dir}",
     ]
 
@@ -307,7 +295,6 @@ def docs(
         "IS_LOCAL": "1",
         "TVM_LIBRARY_PATH": str(REPO_ROOT / build_dir),
     }
-    check_build()
     docker(name=gen_name("docs"), image=image, scripts=scripts, env=env, interactive=interactive)
 
 

From 0274d8e1f124cecc159abf3234251bf010784581 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda <masahi129@gmail.com>
Date: Sat, 21 May 2022 03:33:54 +0900
Subject: [PATCH 50/59] [TIR] Support tensorization using ldmatrix + MMA
 (#11355)

* [TIR] Support tensorization using ldmatrix + MMA

commit 3218facf100b0dfc55715acfd1cee156764129ba
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 14:04:56 2022 +0900

    some clean up

commit 7a235b69dc2023b3098ed44d591edb63b20a8f4e
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 13:55:11 2022 +0900

    parameterize over storage scope in mma store intrin

commit 827ea4c434c35607b241f8e0ae2efe3214ac2458
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 13:37:38 2022 +0900

    properly handle floordiv/mod in codegen

commit 42d4c6f42182c9fd79566c0955f99cc82abd5144
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 09:53:57 2022 +0900

    update tuned factors for fp16

commit 328d0aa36b2ea9ea1b051970d612bff82d2d20e6
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 08:43:30 2022 +0900

    all tests working

commit 5e086cf5fd1404ac38f85c4bfbe692687b45a16c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 07:48:43 2022 +0900

    add doc for mma_fill and mma_store intrin

commit 4f945c4116b6d3bdc965ecb2be2229bb46dc11ab
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 18 06:39:01 2022 +0900

    remove tests

commit df7708f7f67761d9c18f9564bc15abd50c12ac69
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 19:52:14 2022 +0900

    unified test

commit 754c83eeb8510b31fb9652b089177f9b8e642ec0
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 19:36:24 2022 +0900

    clean up LowerWarpmemory

commit 178c3dcee7bfa17d5d93fec02aa858dc62151670
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 19:15:04 2022 +0900

    Use IndexMap

commit 07fb58910338c62847fd902b37801d09b8c673b0
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 17:51:44 2022 +0900

    remove 16x8x8 test

commit 2b05b5a5470ac221d559f31a31a8e2ff753b2414
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 17:31:35 2022 +0900

    generate mma fill/store

commit bf23fc50f0ffa99e875d9247ca66acec0c36677f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 12:23:30 2022 +0900

    mma intrin generation with meta programming

commit 5afb5f00afd642cb1e39872edc7965f476dcdcb7
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 17 05:26:14 2022 +0900

    ldmatrix intrin generation with meta programming

commit fb62abb3424b88ec48c697e306e05889a3ac306f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 20:30:49 2022 +0900

    minor

commit 5a80adce24e84d3ec6bf931b60cb9c730d243394
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:55:57 2022 +0900

    revert some change

commit e599a55078ee75f2480a721098341812db58cf6f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:54:18 2022 +0900

    remove obsolete files

commit 4b13b85ff91d0d592a7e0c01924e0b49b82f35a8
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:51:21 2022 +0900

    wip

commit 848de63455539e25cd0d43e5a65fd048636ef0f7
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:44:29 2022 +0900

    wip

commit b35bff97ed10c22559e2164eb7538db0f711ce7e
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:31:18 2022 +0900

    update parse error msg

commit ad9b053ef865b1f91f03d7b15ed7aae3420ee213
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 19:26:51 2022 +0900

    fix for avoiding Buffer.vload(...) case

commit 54c686443e370edbfae860d0809b1b6182d26414
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 18:59:55 2022 +0900

    wip

commit 078060fe28d22f1db5f07b1c382dee438f02df60
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 18:57:34 2022 +0900

    wip

commit 576f8415e65e0e8a8a7808885e219b3b53867950
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 18:52:15 2022 +0900

    wip

commit 12a376ae2f44aa6660121e64e0358f2866624f7f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 17:54:58 2022 +0900

    Squashed commit of the following:

    commit 48eef4981d1a55aaf3b0ac935f2a10347cb1ac2d
    Author: Masahiro Masuda <masahi129@gmail.com>
    Date:   Mon May 16 17:40:48 2022 +0900

        more comment

    commit 8f67fc87038834e9f7e2c5cd3dfe61fabf442206
    Author: Masahiro Masuda <masahi129@gmail.com>
    Date:   Mon May 16 17:11:27 2022 +0900

        update test

    commit ad85036621c005b733763e67ceffae39c356ec99
    Author: Masahiro Masuda <masahi129@gmail.com>
    Date:   Mon May 16 16:54:01 2022 +0900

        add test

    commit 4a5dc3ffd5d0bb4a1700e57897c9e0f26e3d2a88
    Author: Masahiro Masuda <masahi129@gmail.com>
    Date:   Mon May 16 16:40:47 2022 +0900

        [TVMScript] Support function call to help construct AST

commit 76c1bcf0ade45d7433a0066236add8372b1cc547
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Mon May 16 16:30:07 2022 +0900

    simplify iterator in layout transform

commit 936280324ea2c91429a6a85a1b8ee89c7b825928
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 11:31:39 2022 +0900

    remove obsolet files

commit 2e119b422d72d726d5f2bd20fe48a1e62fcb0510
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 10:43:59 2022 +0900

    calculate mma store dst index using inverse affine map

commit 9489434ee52b546e2abb2ab28173eefd51525ba4
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 10:01:12 2022 +0900

    simplify store

commit 1adcb77b8bba8e5d91080fe6cbfc7add7f4365c2
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 09:43:40 2022 +0900

    simplified fill

commit 7b13c736d23e0eac94137aa918101d788e60d4f3
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 09:22:17 2022 +0900

    simplify intrin desc using index map function

commit bcf212dda0f94c51f55c48921f61d92fd3b83777
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 07:16:42 2022 +0900

    seems to work

commit dd8ccf9ec2e48100158152e5d4590d141424e2e2
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Sat May 14 07:11:57 2022 +0900

    poking with the parser

commit 596582cbfbd08ebe23ea71aaf7a447472415ccd1
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 20:04:59 2022 +0900

    16x8x32 4k trans working

commit 273f89a8a6ac34f7c79147563922d34d44bffd08
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 19:52:13 2022 +0900

    add 16x8x16 fp16 trans

commit 8e2066cc4c6e86616bc9751324e63ba81a3b02af
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 19:32:37 2022 +0900

    16x8x16 4k trans working

commit c2d0744051733e94f840d4517bcee9ca5d444c75
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 19:25:52 2022 +0900

    16x8x16 trans working

commit c2e314cdda1c3a931781e51a863901ea178dffec
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 16:19:32 2022 +0900

    tuned int8 4k, 91 TOPS

commit 94d9d965f19ff1a2ebdd342079ef420fb537b16a
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 15:59:33 2022 +0900

    int8 4k tune working

commit 3ca8ca02593aff7540c9655aa831348246171752
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 08:43:57 2022 +0900

    mma 16x8x32 int8 working with ldmatrix b workaround

commit 54f1cb731d4b42a6cbc08baf144e74646400eef5
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 18:23:27 2022 +0900

    wip

commit 9d2844db602dc65af4dbd06a73fdd815f486b8b9
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 16:38:53 2022 +0900

    test tensorize without layout transform

commit 86ee6dabc801aeb8d6917bec6de97b42025dbdd1
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 15:15:34 2022 +0900

    int8 4k tensorize works

commit 39f9e32c9a64222c91daba2c32969b27207a31d2
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Fri May 13 12:44:39 2022 +0900

    begin int8 4k tune

commit 6fa91e55b5ab2ba0f901d0d35be1b2fb3ab092b0
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 18:53:20 2022 +0900

    try fix ldmatrix b for int8

commit 7a962cddc4799fa3df0c0fdf3c056146d3f2cbdf
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 18:28:34 2022 +0900

    fixed warp_coeff

commit a0afb5698f307382147a38819e004a2db7f554b1
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 12:20:01 2022 +0900

    wip

commit f70ccd09b07d5325454ffdc39a7619ea84aa7e06
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 12:09:57 2022 +0900

    int8 tensorize working

commit 20321fa4674dabc78fe55b5e0e2876c35b245d21
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 07:06:22 2022 +0900

    starting 16x8x32 int8

commit 441fd193c59cdc436d87ab35896cbb8c779ddf35
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu May 12 05:50:46 2022 +0900

    adding fp16 accum case

commit c9d40b69b1b57bfaddffba09ea07624ae90ee465
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 17:04:29 2022 +0900

    clean up

commit 5b2d48635e762c77c824d1c259ac8bcbcc949421
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 16:38:19 2022 +0900

    16x8x16 4k tune working

commit c3cb170d85600d03da5c3f4cda03552208ca0b8c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 16:20:27 2022 +0900

    tensoriz fixed

commit 68039b081efcdd6aea1d132940b3745f50164974
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 15:55:25 2022 +0900

    begin 16x8x16 4k tune

commit ced5d8d980cc267d4735957c25cb60d71ae977d2
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 15:50:11 2022 +0900

    16x8x16 worked

commit 3d2c90d77c1bb2df2193e9af6cbaa2bd927a26d8
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 15:47:26 2022 +0900

    fix

commit 403050b03ad6b4f0ee8d45088ffb324727bbae48
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 15:45:10 2022 +0900

    add 16x8x16 test

commit 18e8d73661c99cd1c83021063b41a457afcb1638
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 06:50:32 2022 +0900

    fixed mma store codegen for 16x8x16

commit ec81250561195705122bccb9a2372f71de68121f
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 04:25:25 2022 +0900

    add 16x8x16 mma store codegen

commit e08df2a62a4809bcd39782949283c16e7703aa5c
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 03:47:47 2022 +0900

    tensorized C_warp init

commit ae0678918929c1ceec73f2039467040c5bb7823b
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed May 11 03:06:06 2022 +0900

    mma store codegen working

commit deb4d6646cc93d4cdb4f2560ce723bee4d86e144
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 10 19:22:57 2022 +0900

    update lower warp memory

commit 71fe5fe465300705fa94f9544a2e1a5070de6e0d
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue May 10 09:01:42 2022 +0900

    tensorizing mma store

commit e80a1f148c47f2a3fac2363a733d8d4e2a2631d0
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu Apr 28 19:54:08 2022 +0900

    clean up

commit a9640f4b7c3c9f22b87ca74a61003438dfd8f992
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu Apr 28 19:40:55 2022 +0900

    add tunable 4k test, 36 TFLOPS

commit b9f7eae7041d1a9b3e434c331c874e8347e89dc4
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Thu Apr 28 18:01:08 2022 +0900

    fixed bug in LowerWarpMemory index splitting for ldmatrix

commit 00df30823f874910ed1ec1f74718100311764234
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Apr 27 07:58:17 2022 +0900

    fixed missing reverse_compute_at

commit 93f9fe7e5f7ad16c8d0e6240c16c0281a0e97dec
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Apr 27 06:55:12 2022 +0900

    add 4k test

commit 3689ef712aa4b282a4818fa2fa2e7e349c3a5eec
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Wed Apr 27 06:54:09 2022 +0900

    temp disable high dim base indices check in tensorize

commit 0c859c4f385ba0b6f9477b569b80cee80b5b7282
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Apr 26 19:18:23 2022 +0900

    clean up

commit f6aadbfcfbd73c1667a6de7aedc5894232b8e750
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Apr 26 19:13:09 2022 +0900

    Add 16x8x8 MMA + LDMatrix test

commit 4cf6b20c6ca415e967ab58d80e4a77c701ad7255
Author: Masahiro Masuda <masahi129@gmail.com>
Date:   Tue Apr 26 18:04:17 2022 +0900

    testing 16x8x8 ldmatrix tensoriation

* set measure_perf to False

* add requires_gpu decorator in tests, always test build on non-ampere

* skip cuda compile on old gpu
---
 include/tvm/tir/builtin.h                     |  27 +
 python/tvm/tir/tensor_intrin/__init__.py      |   1 +
 python/tvm/tir/tensor_intrin/cuda.py          | 469 ++++++++++++++++++
 src/target/source/codegen_cuda.cc             |  76 ++-
 src/tir/op/builtin.cc                         |   6 +
 src/tir/transforms/lower_warp_memory.cc       |  45 +-
 ...est_tir_schedule_tensorize_ldmatrix_mma.py | 422 ++++++++++++++++
 7 files changed, 1042 insertions(+), 4 deletions(-)
 create mode 100644 python/tvm/tir/tensor_intrin/cuda.py
 create mode 100644 tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index f33432645cc3c..5fc42392c3376 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -651,6 +651,33 @@ TVM_DLL const Op& ptx_cp_async();
 TVM_DLL const Op& ptx_commit_group();
 TVM_DLL const Op& ptx_wait_group();
 
+/*!
+ * \brief tvm intrinsic for storing the result of PTX MMA into a destination pointer.
+ *        For example, if each thread in a warp of size 32 has 4 elements from the result of
+ *        m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a
+ *        16x8 region in shared or global memory.
+ *
+ *        There is no real PTX instruction that does that, but we want to hide details of
+ *        complex index manipulation behind this intrinsic to simplify TIR lowering passes (e.g.
+ *        LowerWarpMemory).
+ *
+ * void mma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr src_offset, Var dst_stride);
+ */
+TVM_DLL const Op& mma_store();
+
+/*!
+ * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
+ *        For example, if each thread in a warp of size 32 has 8 elements from the A matrix in
+ *        m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its
+ *        4 accumulation registers.
+ *
+ *        There is no real PTX instruction that does that, but we introduce this intrinsic for the
+ *        same reason as mma_store above.
+ *
+ * void mma_fill(IntImm local_size, Var local_ptr, Expr offset);
+ */
+TVM_DLL const Op& mma_fill();
+
 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
 /*!
  * \brief Get the high level half of the vector
diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py
index 4115c3b900709..a3b47ff6d5d75 100644
--- a/python/tvm/tir/tensor_intrin/__init__.py
+++ b/python/tvm/tir/tensor_intrin/__init__.py
@@ -20,3 +20,4 @@
 from .arm_cpu import *
 from .dot_product_common import *
 from .rocm import *
+from .cuda import *
diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py
new file mode 100644
index 0000000000000..853a37735486b
--- /dev/null
+++ b/python/tvm/tir/tensor_intrin/cuda.py
@@ -0,0 +1,469 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring
+"""Intrinsics for tensorization on NVIDIA GPU."""
+from tvm.script import tir as T
+from .. import IntImm, Cast
+from ..._ffi import register_func
+from ...runtime import convert
+from .. import TensorIntrin
+
+
+def shared_16x16_to_ldmatrix_32x8_layout(i, j):
+    thread_id = 4 * (i % 8) + (j % 8) // 2
+    return thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 2)
+
+
+def shared_16x32_to_ldmatrix_32x16_layout(i, j):
+    thread_id = 4 * (i % 8) + (j % 16) // 4
+    return thread_id, 8 * (j // 16) + (i // 8) * 4 + j % 4
+
+
+def shared_32x16_to_ldmatrix_32x16_layout(i, j):
+    thread_id = (i % 4) + 4 * (j % 8)
+    return thread_id, 8 * (j // 8) + (i // 16) * 4 + i % 4
+
+
+@register_func("tir.index_map.shared_16x16_to_ldmatrix_32x8_layout")
+def index_map_shared_16x16_to_ldmatrix_32x8_layout(ind):
+    i, j = ind[0], ind[1]
+    thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(i, j)
+    return convert([thread_id, local_id])
+
+
+lift = convert
+
+M_DIM = 16
+N_DIM = 16
+WARP_SIZE = 32
+HALF_WARP = WARP_SIZE // 2
+HALF_WARP_expr = lift(HALF_WARP)
+
+
+def get_ldmatrix_intrin(k_dim, dtype, is_b, transposed):
+    local_size = (M_DIM * k_dim) // WARP_SIZE
+    shared_offset = None
+    index_map = None
+
+    if transposed:
+        assert is_b, "Transposed A matrix not supported"
+
+    ldmatrix_col_major = is_b and not transposed
+
+    if k_dim == 16:
+        assert dtype == "float16"
+
+        index_map = shared_16x16_to_ldmatrix_32x8_layout
+
+        if transposed:
+            shared_offset = (
+                lambda tx, stride: stride * 8 * (tx // HALF_WARP_expr)
+                + stride * (tx % 8)
+                + 8 * ((tx % HALF_WARP_expr) // 8)
+            )
+        else:
+            shared_offset = lambda tx, stride: stride * (tx % HALF_WARP_expr) + 8 * (
+                tx // HALF_WARP_expr
+            )
+    else:
+        assert (
+            k_dim == 32 and dtype == "int8"
+        ), "Only k_dim == 16 (float16) or k_dim == 32 (int8) supported for now"
+
+        if ldmatrix_col_major:
+            index_map = shared_32x16_to_ldmatrix_32x16_layout
+            # A dummy offset, ldmatrix cannot be used for int8 + trans case.
+            # We still use the ldmatrix intrinsic, but lower it to a manual loop in the codegen.
+            # Only the stride information is required.
+            shared_offset = lambda _, stride: stride
+        elif is_b and transposed:
+            index_map = shared_16x32_to_ldmatrix_32x16_layout
+            shared_offset = (
+                lambda tx, stride: stride * 8 * (tx // HALF_WARP_expr)
+                + (tx % 8) * stride
+                + 16 * ((tx % HALF_WARP_expr) // 8)
+            )
+        else:
+            index_map = shared_16x32_to_ldmatrix_32x16_layout
+            shared_offset = lambda tx, stride: stride * (tx % 16) + 16 * (tx // 16)
+
+    assert index_map and shared_offset
+
+    if is_b and not transposed:
+        row_dim = k_dim
+        col_dim = M_DIM
+    else:
+        row_dim = M_DIM
+        col_dim = k_dim
+
+    shmem_shape = (row_dim, col_dim)
+
+    @T.prim_func
+    def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None:
+        shared = T.match_buffer(
+            shared_handle, shmem_shape, dtype, align=128, offset_factor=16, scope="shared"
+        )
+        warp = T.match_buffer(
+            warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp"
+        )
+
+        with T.block("root"):
+            T.reads(shared[0:row_dim, 0:col_dim])
+            T.writes(warp[0:WARP_SIZE, 0:local_size])
+
+            for ax0, ax1 in T.grid(row_dim, col_dim):
+                with T.block("shared_warp"):
+                    v0, v1 = T.axis.remap("SS", [ax0, ax1])
+                    T.reads(shared[v0, v1])
+
+                    thread_id, local_id = index_map(v0, v1)
+                    T.writes(warp[thread_id, local_id])
+                    warp[thread_id, local_id] = shared[v0, v1]
+
+    @T.prim_func
+    def ldmatrix_impl(warp_handle: T.handle, shared_handle: T.handle) -> None:
+        s0 = T.var("int32")
+        s1 = T.var("int32")
+        shared = T.match_buffer(
+            shared_handle,
+            shmem_shape,
+            dtype,
+            align=128,
+            offset_factor=16,
+            scope="shared",
+            strides=[s0, s1],
+        )
+        warp = T.match_buffer(
+            warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp"
+        )
+
+        with T.block("root"):
+            T.reads(shared[0:row_dim, 0:col_dim])
+            T.writes(warp[0:WARP_SIZE, 0:local_size])
+            tx = T.env_thread("threadIdx.x")
+            T.launch_thread(tx, WARP_SIZE)
+
+            T.evaluate(
+                T.ptx_ldmatrix(
+                    ldmatrix_col_major,
+                    4,  # Always load 4 matrices
+                    ".b16",
+                    warp.data,
+                    warp.elem_offset + lift(local_size) * tx,
+                    shared.access_ptr("r"),
+                    shared_offset(tx, s0),
+                    dtype=dtype,
+                )
+            )
+
+    return ldmatrix_desc, ldmatrix_impl
+
+
+def get_mma_intrin(k_dim, out_dtype, b_transposed):
+    local_size = (M_DIM * k_dim) // WARP_SIZE
+    local_size_out = (M_DIM * N_DIM) // 32
+
+    index_map_C = shared_16x16_to_ldmatrix_32x8_layout
+
+    if k_dim == 16:
+        index_map_A = shared_16x16_to_ldmatrix_32x8_layout
+        index_map_B = shared_16x16_to_ldmatrix_32x8_layout
+        mma_prefix = "m16n8k16"
+    elif k_dim == 32 and b_transposed:
+        index_map_A = index_map_B = shared_16x32_to_ldmatrix_32x16_layout
+        mma_prefix = "m16n8k32"
+    elif k_dim == 32 and not b_transposed:
+        index_map_A = shared_16x32_to_ldmatrix_32x16_layout
+        index_map_B = shared_32x16_to_ldmatrix_32x16_layout
+        mma_prefix = "m16n8k32"
+    else:
+        assert False
+
+    out_dtype_abbrv = {"float16": "fp16", "float32": "fp32", "int32": "int32"}[out_dtype]
+
+    if out_dtype in ["float16", "float32"]:
+        in_dtype = "float16"
+        in_dtype_abbrv = "fp16"
+    else:
+        in_dtype = "int8"
+        in_dtype_abbrv = "int8"
+
+    def maybe_cast(v):
+        if out_dtype in ["float32", "int32"]:
+            return Cast(out_dtype, v)
+        return v
+
+    def maybe_swap(i, j):
+        if b_transposed:
+            return j, i
+        return i, j
+
+    @T.prim_func
+    def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(
+            a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+        )
+        B = T.match_buffer(
+            b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+        )
+        C = T.match_buffer(
+            c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp"
+        )
+
+        with T.block("root"):
+            T.reads(
+                C[0:WARP_SIZE, 0:local_size_out],
+                A[0:WARP_SIZE, 0:local_size],
+                B[0:WARP_SIZE, 0:local_size],
+            )
+            T.writes(C[0:WARP_SIZE, 0:local_size_out])
+
+            for i, j, k in T.grid(M_DIM, N_DIM, k_dim):
+                with T.block("C"):
+                    i, j, k = T.axis.remap("SSR", [i, j, k])
+                    b_row_ind, b_col_ind = maybe_swap(k, j)
+
+                    thread_id_C, local_id_C = index_map_C(i, j)
+                    thread_id_A, local_id_A = index_map_A(i, k)
+                    thread_id_B, local_id_B = index_map_B(b_row_ind, b_col_ind)
+
+                    T.reads(
+                        C[thread_id_C, local_id_C],
+                        A[thread_id_A, local_id_A],
+                        B[thread_id_B, local_id_B],
+                    )
+                    T.writes(C[thread_id_C, local_id_C])
+
+                    C[thread_id_C, local_id_C] += maybe_cast(
+                        A[thread_id_A, local_id_A]
+                    ) * maybe_cast(B[thread_id_B, local_id_B])
+
+    @T.prim_func
+    def mma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(
+            a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+        )
+        B = T.match_buffer(
+            b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp"
+        )
+        C = T.match_buffer(
+            c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp"
+        )
+
+        with T.block("root"):
+            T.reads(
+                C[0:WARP_SIZE, 0:local_size_out],
+                A[0:WARP_SIZE, 0:local_size],
+                B[0:WARP_SIZE, 0:local_size],
+            )
+            T.writes(C[0:WARP_SIZE, 0:local_size_out])
+            tx = T.env_thread("threadIdx.x")
+            T.launch_thread(tx, WARP_SIZE)
+
+            T.evaluate(
+                T.ptx_mma(
+                    mma_prefix,
+                    "row",
+                    "col",
+                    in_dtype_abbrv,
+                    in_dtype_abbrv,
+                    out_dtype_abbrv,
+                    A.data,
+                    A.elem_offset + tx * lift(local_size),
+                    B.data,
+                    B.elem_offset + tx * lift(local_size),
+                    C.data,
+                    C.elem_offset + tx * lift(local_size_out),
+                    False,
+                    dtype=out_dtype,
+                )
+            )
+
+            T.evaluate(
+                T.ptx_mma(
+                    mma_prefix,
+                    "row",
+                    "col",
+                    in_dtype_abbrv,
+                    in_dtype_abbrv,
+                    out_dtype_abbrv,
+                    A.data,
+                    A.elem_offset + tx * lift(local_size),
+                    B.data,
+                    B.elem_offset + tx * lift(local_size) + lift(local_size) // 2,
+                    C.data,
+                    C.elem_offset + tx * lift(local_size_out) + lift(local_size_out) // 2,
+                    False,
+                    dtype=out_dtype,
+                )
+            )
+
+    return mma_sync_desc, mma_sync_impl
+
+
+def get_mma_fill_intrin(dtype, local_size):
+    zero = IntImm("int32", 0).astype(dtype)
+
+    # Assume M = N = 16
+    index_map = shared_16x16_to_ldmatrix_32x8_layout
+
+    @T.prim_func
+    def mma_fill_desc(a: T.handle) -> None:
+        C_warp = T.match_buffer(a, [WARP_SIZE, local_size], dtype=dtype, scope="warp")
+
+        with T.block("root"):
+            T.reads()
+            T.writes(C_warp[0:WARP_SIZE, 0:local_size])
+            for i0, i1 in T.grid(M_DIM, N_DIM):
+                with T.block("C_warp"):
+                    i, j = T.axis.remap("SS", [i0, i1])
+                    thread_id, local_id = index_map(i, j)
+                    T.reads()
+                    T.writes(C_warp[thread_id, local_id])
+                    C_warp[thread_id, local_id] = zero
+
+    @T.prim_func
+    def mma_fill_impl(a: T.handle) -> None:
+        C_warp = T.match_buffer(
+            a, [WARP_SIZE, local_size], dtype=dtype, scope="warp", offset_factor=1
+        )
+
+        with T.block("root"):
+            T.reads()
+            T.writes(C_warp[0:WARP_SIZE, 0:local_size])
+            tx = T.env_thread("threadIdx.x")
+            T.launch_thread(tx, WARP_SIZE)
+
+            T.evaluate(T.mma_fill(local_size, C_warp.data, C_warp.elem_offset, dtype=dtype))
+
+    return mma_fill_desc, mma_fill_impl
+
+
+def get_mma_store_intrin(dtype, local_size, scope="global"):
+    """Return (desc, impl) storing a warp-scope MMA tile to `scope` memory; assumes M = N = 16."""
+    index_map = shared_16x16_to_ldmatrix_32x8_layout
+
+    @T.prim_func
+    def mma_store_desc(a: T.handle, c: T.handle) -> None:
+        C_warp = T.match_buffer(a, [WARP_SIZE, local_size], dtype=dtype, scope="warp")
+        C = T.match_buffer(c, [M_DIM, N_DIM], dtype=dtype, scope=scope)
+
+        with T.block("root"):
+            T.reads(C_warp[0:WARP_SIZE, 0:local_size])
+            T.writes(C[0:M_DIM, 0:N_DIM])
+            for i0, i1 in T.grid(M_DIM, N_DIM):
+                with T.block("C_warp"):
+                    v0, v1 = T.axis.remap("SS", [i0, i1])
+                    thread_id, local_id = index_map(v0, v1)
+                    T.reads(C_warp[thread_id, local_id])
+                    T.writes(C[v0, v1])
+                    C[v0, v1] = C_warp[thread_id, local_id]
+
+    @T.prim_func
+    def mma_store_impl(a: T.handle, c: T.handle) -> None:
+        s0 = T.var("int32")
+        s1 = T.var("int32")
+        # C uses the caller's `scope` so that it matches the buffer in mma_store_desc.
+        C_warp = T.match_buffer(
+            a, [WARP_SIZE, local_size], dtype=dtype, scope="warp", offset_factor=1
+        )
+        C = T.match_buffer(
+            c, [M_DIM, N_DIM], dtype=dtype, scope=scope, offset_factor=1, strides=[s0, s1]
+        )
+
+        with T.block("root"):
+            T.reads(C_warp[0:WARP_SIZE, 0:local_size])
+            T.writes(C[0:M_DIM, 0:N_DIM])
+            tx = T.env_thread("threadIdx.x")
+            T.launch_thread(tx, WARP_SIZE)
+
+            T.evaluate(
+                T.mma_store(
+                    M_DIM,
+                    N_DIM,
+                    C.access_ptr("w"),
+                    C_warp.data,
+                    C_warp.elem_offset,
+                    s0,
+                    dtype=dtype,
+                )
+            )
+
+    return mma_store_desc, mma_store_impl
+
+
+LDMATRIX_16x16_A_INTRIN = "mma.ldmatrix_16x16_a"
+TensorIntrin.register(LDMATRIX_16x16_A_INTRIN, *get_ldmatrix_intrin(16, "float16", False, False))
+
+LDMATRIX_16x16_B_INTRIN = "mma.ldmatrix_16x16_b"
+TensorIntrin.register(LDMATRIX_16x16_B_INTRIN, *get_ldmatrix_intrin(16, "float16", True, False))
+
+LDMATRIX_16x16_B_TRANS_INTRIN = "mma.ldmatrix_16x16_b_trans"
+TensorIntrin.register(
+    LDMATRIX_16x16_B_TRANS_INTRIN, *get_ldmatrix_intrin(16, "float16", True, True)
+)
+
+LDMATRIX_16x32_A_INTRIN = "mma.ldmatrix_16x32_a"
+TensorIntrin.register(LDMATRIX_16x32_A_INTRIN, *get_ldmatrix_intrin(32, "int8", False, False))
+
+LDMATRIX_32x16_B_INTRIN = "mma.ldmatrix_32x16_b"
+TensorIntrin.register(LDMATRIX_32x16_B_INTRIN, *get_ldmatrix_intrin(32, "int8", True, False))
+
+LDMATRIX_16x32_B_TRANS_INTRIN = "mma.ldmatrix_16x32_b_trans"
+TensorIntrin.register(LDMATRIX_16x32_B_TRANS_INTRIN, *get_ldmatrix_intrin(32, "int8", True, True))
+
+MMA_f16f16f32_INTRIN = "mma_f16f16f32"
+TensorIntrin.register(MMA_f16f16f32_INTRIN, *get_mma_intrin(16, "float32", False))
+
+MMA_f16f16f32_TRANS_INTRIN = "mma_f16f16f32_trans"
+TensorIntrin.register(MMA_f16f16f32_TRANS_INTRIN, *get_mma_intrin(16, "float32", True))
+
+MMA_f16f16f16_INTRIN = "mma_f16f16f16"
+TensorIntrin.register(MMA_f16f16f16_INTRIN, *get_mma_intrin(16, "float16", False))
+
+MMA_f16f16f16_TRANS_INTRIN = "mma_f16f16f16_trans"
+TensorIntrin.register(MMA_f16f16f16_TRANS_INTRIN, *get_mma_intrin(16, "float16", True))
+
+MMA_i8i8i32_INTRIN = "mma_i8i8i32"
+TensorIntrin.register(MMA_i8i8i32_INTRIN, *get_mma_intrin(32, "int32", False))
+
+MMA_i8i8i32_TRANS_INTRIN = "mma_i8i8i32_trans"
+TensorIntrin.register(MMA_i8i8i32_TRANS_INTRIN, *get_mma_intrin(32, "int32", True))
+
+MMA_fill_16x16_f32_INTRIN = "mma_fill_16x16_f32"
+TensorIntrin.register(MMA_fill_16x16_f32_INTRIN, *get_mma_fill_intrin("float32", 8))
+
+MMA_fill_16x16_f16_INTRIN = "mma_fill_16x16_f16"
+TensorIntrin.register(MMA_fill_16x16_f16_INTRIN, *get_mma_fill_intrin("float16", 8))
+
+MMA_fill_16x16_i32_INTRIN = "mma_fill_16x16_i32"
+TensorIntrin.register(MMA_fill_16x16_i32_INTRIN, *get_mma_fill_intrin("int32", 8))
+
+MMA_store_16x16_f32_global_INTRIN = "mma_store_16x16_f32_global_"
+TensorIntrin.register(
+    MMA_store_16x16_f32_global_INTRIN, *get_mma_store_intrin("float32", 8, "global")
+)
+
+MMA_store_16x16_f16_global_INTRIN = "mma_store_16x16_f16_global_"
+TensorIntrin.register(
+    MMA_store_16x16_f16_global_INTRIN, *get_mma_store_intrin("float16", 8, "global")
+)
+
+MMA_store_16x16_i32_global_INTRIN = "mma_store_16x16_i32_global_"
+TensorIntrin.register(
+    MMA_store_16x16_i32_global_INTRIN, *get_mma_store_intrin("int32", 8, "global")
+)
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 7459d4c250baa..616e75f2e7764 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -25,6 +25,7 @@
 
 #include <tvm/arith/analyzer.h>
 #include <tvm/runtime/registry.h>
+#include <tvm/tir/index_map.h>
 #include <tvm/tir/stmt_functor.h>
 
 #include <cmath>
@@ -818,9 +819,78 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     std::string local_ptr = this->PrintExpr(op->args[3]);
     std::string local_elem_offset = this->PrintExpr(op->args[4]);
     std::string smem_ptr = this->PrintExpr(op->args[5]);
-    std::string smem_elem_offset = this->PrintExpr(op->args[6]);
-    this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset,
-                                            smem_ptr, smem_elem_offset);
+    if (trans && op->dtype.bits() == 8) {
+      // Since ldmatrix assumes that a matrix element is 16 bit, it cannot properly transpose an
+      // int8 matrix.
+      std::string smem_stride = this->PrintExpr(op->args[6]);
+      ICHECK(num == 4);
+      os << "for (int i = 0; i < 16; ++i) {\n";
+      os << local_ptr << "[" + local_elem_offset + " + i] = " << smem_ptr
+         << "[(i % 8) / 4 * " + smem_stride + " * 16 + (threadIdx.x % 4) * 4 * " + smem_stride +
+                "+ (i % 4) * " + smem_stride + " + threadIdx.x / 4 +  (i / 8) * 8];\n";
+      os << "}\n";
+    } else {
+      std::string smem_elem_offset = this->PrintExpr(op->args[6]);
+      this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr, local_elem_offset,
+                                              smem_ptr, smem_elem_offset);
+    }
+  } else if (op->op.same_as(builtin::mma_store())) {
+    int m = Downcast<Integer>(op->args[0])->value;
+    int n = Downcast<Integer>(op->args[1])->value;
+    std::string dst = this->PrintExpr(op->args[2]);
+    std::string src = this->PrintExpr(op->args[3]);
+    std::string src_offset = this->PrintExpr(op->args[4]);
+    PrimExpr stride = op->args[5];
+
+    ICHECK(m == 16 && n == 16) << "Only m == 16 && n == 16 case supported for now";
+
+    // Each thread in a warp holds a certain number of elements of an MMA output.
+    // For example, if we compute a 16x16 tile using MMA, each thread holds 8 elements
+    // in its registers. So conceptually, a warp memory is organized as a 32x8 block.
+    // A map from a 16x16 tile to a 32x8 block of memory is specified by the index map below.
+
+    // To store the 32x8 output back to a 16x16 tile in shared or global memory, we invert this map
+    // to determine the output location for each of the 8 elements.
+
+    const auto* index_map_func =
+        runtime::Registry::Get("tir.index_map.shared_16x16_to_ldmatrix_32x8_layout");
+    ICHECK(index_map_func);
+
+    auto inverse_index_map =
+        IndexMap::FromFunc(2, *index_map_func).Inverse({Range(0, m), Range(0, n)});
+    auto indices_16x16 = inverse_index_map->final_indices;
+
+    // "//" and "%" in the index map are translated to FloorDiv/Mod, but the plain Div/Mod are fine.
+    // FloorDiv/Mod are supposed to be lowered before they reach codegen, so manually replace them
+    // with the plain ones here.
+    class LowerFloorDivMod : public ExprMutator {
+     public:
+      PrimExpr VisitExpr_(const FloorDivNode* op) {
+        return tir::Div(this->VisitExpr(op->a), this->VisitExpr(op->b));
+      }
+      PrimExpr VisitExpr_(const FloorModNode* op) {
+        return tir::Mod(this->VisitExpr(op->a), this->VisitExpr(op->b));
+      }
+    };
+
+    auto dst_ind = LowerFloorDivMod()(indices_16x16[0] * stride + indices_16x16[1]);
+
+    var_idmap_[inverse_index_map->initial_indices[0].get()] = "threadIdx.x";
+    var_idmap_[inverse_index_map->initial_indices[1].get()] = "local_id";
+
+    os << "for (int local_id = 0; local_id < 8; ++local_id) {\n";
+    os << dst << "[" + this->PrintExpr(dst_ind) + "]"
+       << " = " << src << "[" << src_offset << " + local_id];\n";
+    os << "}\n";
+
+  } else if (op->op.same_as(builtin::mma_fill())) {
+    std::string num_elem = this->PrintExpr(op->args[0]);
+    std::string dst = this->PrintExpr(op->args[1]);
+    std::string dst_offset = this->PrintExpr(op->args[2]);
+
+    os << "for (int i = 0; i < " << num_elem << "; ++i) {\n";
+    os << dst << "[" << dst_offset << " + i] = 0.0;";
+    os << "}\n";
   } else if (op->op.same_as(builtin::ptx_cp_async())) {
     std::string dst = this->PrintExpr(op->args[0]);
     std::string dst_offset = this->PrintExpr(op->args[1]);
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 0415d1bbec9e8..1871a3d7bf70b 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -256,6 +256,12 @@ TIR_DEFINE_BUILTIN_FUNC(ptx_commit_group)
 TIR_DEFINE_BUILTIN_FUNC(ptx_wait_group)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_BUILTIN_FUNC(mma_store).set_attr<TCallEffectKind>("TCallEffectKind",
+                                                             Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_BUILTIN_FUNC(mma_fill).set_attr<TCallEffectKind>("TCallEffectKind",
+                                                            Integer(CallEffectKind::kOpaque));
+
 TIR_DEFINE_BUILTIN_FUNC(vectorhigh)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kPure));
 
diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc
index 40971114d416c..d8250cd09888e 100644
--- a/src/tir/transforms/lower_warp_memory.cc
+++ b/src/tir/transforms/lower_warp_memory.cc
@@ -101,7 +101,7 @@ namespace tir {
 
 // Visitor to find m in pattern
 // store warp_mem[m * warp_index + (width * m) * y + x]
-class WarpStoreCoeffFinder : private StmtVisitor {
+class WarpStoreCoeffFinder : private StmtExprVisitor {
  public:
   WarpStoreCoeffFinder(const VarNode* buffer, Var warp_index, arith::Analyzer* analyzer)
       : buffer_(buffer), warp_index_(warp_index), analyzer_(analyzer) {}
@@ -113,6 +113,18 @@ class WarpStoreCoeffFinder : private StmtVisitor {
 
  private:
   /// Visitor implementation
+  void VisitExpr_(const CallNode* op) final {
+    if (op->op.same_as(builtin::ptx_ldmatrix()) && op->args[3].as<VarNode>() == buffer_) {
+      UpdatePattern(op->args[4]);
+    } else if (op->op.same_as(builtin::mma_fill()) && op->args[1].as<VarNode>() == buffer_) {
+      auto* local_size = op->args[0].as<IntImmNode>();
+      ICHECK(local_size) << "Integer expected for the first argument of mma_fill";
+      warp_coeff_ = local_size->value;
+    }
+
+    StmtExprVisitor::VisitExpr_(op);
+  }
+
   void VisitStmt_(const StoreNode* op) final {
     LOG(FATAL) << "Unexpected use of deprecated StoreNode.  Please use BufferStoreNode instead.";
   }
@@ -245,6 +257,37 @@ class WarpAccessRewriter : protected StmtExprMutator {
   }
 
  protected:
+  PrimExpr RewriteIndicesAt(const CallNode* op, const std::vector<int>& indices) {
+    Array<PrimExpr> new_args = op->args;
+    for (int i : indices) {
+      if (op->args[i].get() == buffer_) {
+        PrimExpr local_index = SplitIndexByGroup(op->args[i + 1]).first;
+        new_args.Set(i + 1, local_index);
+      }
+    }
+    return Call(op->dtype, op->op, new_args);
+  }
+
+  PrimExpr VisitExpr_(const CallNode* op) override {
+    if (op->op.same_as(builtin::ptx_mma())) {
+      return RewriteIndicesAt(op, {6, 8, 10});
+    }
+
+    if (op->op.same_as(builtin::ptx_ldmatrix())) {
+      return RewriteIndicesAt(op, {3});
+    }
+
+    if (op->op.same_as(builtin::mma_store())) {
+      return RewriteIndicesAt(op, {3});
+    }
+
+    if (op->op.same_as(builtin::mma_fill())) {
+      return RewriteIndicesAt(op, {1});
+    }
+
+    return StmtExprMutator::VisitExpr_(op);
+  }
+
   PrimExpr VisitExpr_(const VarNode* op) override {
     ICHECK(op != buffer_) << "Cannot access address of warp memory directly";
     return StmtExprMutator::VisitExpr_(op);
diff --git a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
new file mode 100644
index 0000000000000..67e8ae0ad8367
--- /dev/null
+++ b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
@@ -0,0 +1,422 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+import tvm
+from tvm import te
+from tvm.tir.tensor_intrin.cuda import (
+    LDMATRIX_16x16_A_INTRIN,
+    LDMATRIX_16x16_B_INTRIN,
+    LDMATRIX_16x16_B_TRANS_INTRIN,
+    LDMATRIX_16x32_A_INTRIN,
+    LDMATRIX_32x16_B_INTRIN,
+    LDMATRIX_16x32_B_TRANS_INTRIN,
+    MMA_f16f16f32_INTRIN,
+    MMA_f16f16f32_TRANS_INTRIN,
+    MMA_f16f16f16_INTRIN,
+    MMA_f16f16f16_TRANS_INTRIN,
+    MMA_i8i8i32_INTRIN,
+    MMA_i8i8i32_TRANS_INTRIN,
+    MMA_fill_16x16_f32_INTRIN,
+    MMA_fill_16x16_f16_INTRIN,
+    MMA_fill_16x16_i32_INTRIN,
+    MMA_store_16x16_f32_global_INTRIN,
+    MMA_store_16x16_f16_global_INTRIN,
+    MMA_store_16x16_i32_global_INTRIN,
+    shared_16x16_to_ldmatrix_32x8_layout,
+    shared_32x16_to_ldmatrix_32x16_layout,
+    shared_16x32_to_ldmatrix_32x16_layout,
+)
+import tvm.testing
+import numpy as np
+
+
+M = 4096
+N = 4096
+K = 4096
+measure_perf = False
+gflops = (N * M * K) * 2 / 1e9
+
+
+def matmul(m, n, k, in_dtype, out_dtype, b_transposed):
+    b_shape = (n, k) if b_transposed else (k, n)
+    a = te.placeholder((m, k), name="A", dtype=in_dtype)
+    b = te.placeholder(b_shape, name="B", dtype=in_dtype)
+    k = te.reduce_axis((0, k), name="k")
+
+    def maybe_cast(v):
+        if in_dtype != out_dtype:
+            return tvm.tir.Cast(out_dtype, v)
+        return v
+
+    def maybe_swap(i, j):
+        if b_transposed:
+            return j, i
+        return i, j
+
+    c = te.compute(
+        (m, n),
+        lambda i, j: te.sum(maybe_cast(a[i, k]) * maybe_cast(b[maybe_swap(k, j)]), axis=[k]),
+        name="C",
+    )
+    return (a, b, c)
+
+
+def is_ampere_or_newer():
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    return major >= 8
+
+
+def run_test(
+    k_inner,
+    in_dtype,
+    out_dtype,
+    b_transposed,
+    i_factors,
+    j_factors,
+    k_factors,
+    index_map_A,
+    index_map_B,
+    index_map_C,
+    ldmatrix_a_intrin,
+    ldmatrix_b_intrin,
+    mma_intrin,
+    mma_fill_intrin,
+    mma_store_intrin,
+):
+    workload = te.create_prim_func(matmul(M, N, K, in_dtype, out_dtype, b_transposed))
+    ir_module = tvm.IRModule({"main": workload})
+    sch = tvm.tir.Schedule(ir_module)
+
+    block = sch.get_block("C")
+    i, j, k = sch.get_loops(block)
+    i, i_tc = sch.split(i, factors=[None, 16])
+    j, j_tc = sch.split(j, factors=[None, 16])
+    k, k_tc = sch.split(k, factors=[None, k_inner])
+
+    sch.reorder(i, j, k, i_tc, j_tc, k_tc)
+
+    block_inner = sch.blockize(i_tc)
+    block_outer, block_inner = block_inner, block
+
+    num_ty = i_factors[2] * j_factors[2]
+
+    i0, i1, i2, i3, i4 = sch.split(i, factors=i_factors)
+    j0, j1, j2, j3, j4 = sch.split(j, factors=j_factors)
+    k0, k1, k2 = sch.split(k, k_factors)
+
+    sch.reorder(i0, j0, i1, j1, j2, i2, k0, k1, i3, j3, k2, i4, j4)
+
+    block_idx = sch.fuse(i0, j0)
+    block_idy = sch.fuse(i1, j1)
+    thread_idy = sch.fuse(j2, i2)
+    sch.bind(block_idx, "blockIdx.x")
+    sch.bind(block_idy, "blockIdx.y")
+    sch.bind(thread_idy, "threadIdx.y")
+
+    def fetch_to_shared(block, idx, ndim):
+        block_read = sch.cache_read(block, idx, "shared")
+        sch.compute_at(block_read, k0)
+        vector_size = 16 if in_dtype == "int8" else 8
+        warp_size = 32
+        fused = sch.fuse(*sch.get_loops(block_read)[-ndim:])
+        _, f_1, f_2, f_3 = sch.split(fused, factors=[None, num_ty, warp_size, vector_size])
+        sch.bind(f_2, "threadIdx.x")
+        sch.bind(f_1, "threadIdx.y")
+        sch.vectorize(f_3)
+        offset = 8 if in_dtype == "float16" else 16
+        sch.storage_align(block_read, 0, axis=-2, factor=32, offset=offset)
+
+        return block_read
+
+    fetch_to_shared(block_outer, 0, 2)
+    fetch_to_shared(block_outer, 1, 2)
+
+    A_warp = sch.cache_read(block_outer, 0, "warp")
+    B_warp = sch.cache_read(block_outer, 1, "warp")
+
+    sch.compute_at(A_warp, k1)
+    sch.compute_at(B_warp, k1)
+
+    C_warp = sch.cache_write(block_outer, 0, "warp")
+    sch.reverse_compute_at(C_warp, thread_idy)
+
+    ii, jj = sch.get_loops(C_warp)[-2:]
+    io, ii = sch.split(ii, factors=[None, 16])
+    jo, ji = sch.split(jj, factors=[None, 16])
+    sch.reorder(io, jo, ii, ji)
+
+    sch.decompose_reduction(block_outer, sch.get_loops(block_outer)[3])
+    block_init_c = sch.get_block("C_init")
+
+    def tile_wmma_fragment(block_read, height, width):
+        i, j = sch.get_loops(block_read)[-2:]
+        i0, i1 = sch.split(i, factors=[None, height])
+        j0, j1 = sch.split(j, factors=[None, width])
+        sch.reorder(i0, j0, i1, j1)
+        return i1
+
+    loop_a = tile_wmma_fragment(A_warp, 16, k_inner)
+
+    if b_transposed:
+        loop_b = tile_wmma_fragment(B_warp, 16, k_inner)
+    else:
+        loop_b = tile_wmma_fragment(B_warp, k_inner, 16)
+
+    sch.transform_layout(A_warp, 0, "write", index_map_A)
+    sch.transform_layout(B_warp, 0, "write", index_map_B)
+    sch.transform_layout(C_warp, 0, "read", index_map_C)
+
+    sch.tensorize(loop_a, ldmatrix_a_intrin)
+    sch.tensorize(loop_b, ldmatrix_b_intrin)
+    sch.tensorize(sch.get_loops(block_inner)[-3], mma_intrin)
+    sch.tensorize(sch.get_loops(block_init_c)[-2], mma_fill_intrin)
+    sch.tensorize(sch.get_loops(C_warp)[-2], mma_store_intrin)
+
+    if not is_ampere_or_newer():
+        return None
+
+    f = tvm.build(sch.mod["main"], target="cuda", name="dense")
+
+    dev = tvm.device("cuda", 0)
+
+    if in_dtype == "float16":
+        a_np = np.random.uniform(size=(M, K)).astype("float16")
+
+        if b_transposed:
+            b_np = np.random.uniform(size=(N, K)).astype("float16")
+            c_np = np.dot(a_np.astype("float32"), b_np.astype("float32").transpose()).astype(
+                out_dtype
+            )
+        else:
+            b_np = np.random.uniform(size=(K, N)).astype("float16")
+            c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype(out_dtype)
+    else:
+        a_np = np.random.randint(-128, 128, (M, K)).astype("int8")
+
+        if b_transposed:
+            b_np = np.random.randint(-128, 128, (N, K)).astype("int8")
+            c_np = np.dot(a_np.astype("float32"), b_np.astype("float32").transpose()).astype(
+                "int32"
+            )
+        else:
+            b_np = np.random.randint(-128, 128, (K, N)).astype("int8")
+            c_np = np.dot(a_np.astype("float32"), b_np.astype("float32")).astype("int32")
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(b_np, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype=out_dtype), dev)
+
+    f(a, b, c)
+
+    if out_dtype != "float16":
+        # The numpy reference is computed with fp32 precision (otherwise too slow).
+        # So there is non-trivial accuracy difference if TVM result is computed with fp16 accumulation.
+        tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3)
+
+    return lambda: f.time_evaluator(f.entry_name, dev, number=500)(a, b, c)
+
+
+@tvm.testing.requires_cuda
+def test_f16f16f32_m16n16k16():  # fp16 inputs, fp32 output; B as (K, N) then as (N, K)
+    def index_map(i, j):  # one map serves A, B and C below: every operand uses 16x16 tiles
+        return (
+            i // 16,  # tile index along the first axis
+            j // 16,  # tile index along the second axis
+            *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16),  # intra-tile coordinates
+        )
+
+    k_inner = 16  # k extent of one A/B fragment (see tile_wmma_fragment in run_test)
+    in_dtype = "float16"
+    out_dtype = "float32"  # not fp16, so run_test compares the result against numpy
+    i_factors, j_factors, k_factors = [4, 8, 2, 4, 1], [1, 64, 2, 1, 2], [128, 2, 1]
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        False,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map,  # index_map_A
+        index_map,  # index_map_B
+        index_map,  # index_map_C
+        LDMATRIX_16x16_A_INTRIN,
+        LDMATRIX_16x16_B_INTRIN,
+        MMA_f16f16f32_INTRIN,
+        MMA_fill_16x16_f32_INTRIN,
+        MMA_store_16x16_f32_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("f16f16f32_m16n16k16: %f GFLOPS" % (gflops / (timer().mean)))
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        True,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map,  # index_map_A
+        index_map,  # index_map_B
+        index_map,  # index_map_C
+        LDMATRIX_16x16_A_INTRIN,
+        LDMATRIX_16x16_B_TRANS_INTRIN,
+        MMA_f16f16f32_TRANS_INTRIN,
+        MMA_fill_16x16_f32_INTRIN,
+        MMA_store_16x16_f32_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("f16f16f32_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean)))
+
+
+@tvm.testing.requires_cuda
+def test_f16f16f16_m16n16k16():  # fp16 inputs, fp16 output; B as (K, N) then as (N, K)
+    def index_map(i, j):  # one map serves A, B and C below: every operand uses 16x16 tiles
+        return (
+            i // 16,  # tile index along the first axis
+            j // 16,  # tile index along the second axis
+            *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16),  # intra-tile coordinates
+        )
+
+    k_inner = 16  # k extent of one A/B fragment (see tile_wmma_fragment in run_test)
+    in_dtype = "float16"
+    out_dtype = "float16"  # run_test skips the numpy comparison when the output is fp16
+    i_factors, j_factors, k_factors = [16, 2, 1, 4, 2], [16, 2, 2, 1, 4], [128, 2, 1]
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        False,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map,  # index_map_A
+        index_map,  # index_map_B
+        index_map,  # index_map_C
+        LDMATRIX_16x16_A_INTRIN,
+        LDMATRIX_16x16_B_INTRIN,
+        MMA_f16f16f16_INTRIN,
+        MMA_fill_16x16_f16_INTRIN,
+        MMA_store_16x16_f16_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("f16f16f16_m16n16k16: %f GFLOPS" % (gflops / (timer().mean)))
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        True,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map,  # index_map_A
+        index_map,  # index_map_B
+        index_map,  # index_map_C
+        LDMATRIX_16x16_A_INTRIN,
+        LDMATRIX_16x16_B_TRANS_INTRIN,
+        MMA_f16f16f16_TRANS_INTRIN,
+        MMA_fill_16x16_f16_INTRIN,
+        MMA_store_16x16_f16_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("f16f16f16_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean)))
+
+
+@tvm.testing.requires_cuda
+def test_i8i8i32_m16n16k32():  # int8 inputs, int32 output; B as (K, N) then as (N, K)
+    def index_map_A(i, j):  # operand stored as 16x32 tiles
+        return (
+            i // 16,  # tile index along the first axis
+            j // 32,  # tile index along the second axis
+            *shared_16x32_to_ldmatrix_32x16_layout(i % 16, j % 32),  # intra-tile coordinates
+        )
+
+    def index_map_B(i, j):  # operand stored as 32x16 tiles (non-transposed B)
+        return (
+            i // 32,  # tile index along the first axis
+            j // 16,  # tile index along the second axis
+            *shared_32x16_to_ldmatrix_32x16_layout(i % 32, j % 16),  # intra-tile coordinates
+        )
+
+    def index_map_C(i, j):  # accumulator stored as 16x16 tiles
+        return (
+            i // 16,  # tile index along the first axis
+            j // 16,  # tile index along the second axis
+            *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16),  # intra-tile coordinates
+        )
+
+    k_inner = 32  # k extent of one A/B fragment (see tile_wmma_fragment in run_test)
+    in_dtype = "int8"
+    out_dtype = "int32"  # not fp16, so run_test compares the result against numpy
+    i_factors, j_factors, k_factors = [1, 32, 1, 4, 2], [8, 4, 4, 2, 1], [32, 2, 2]
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        False,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map_A,
+        index_map_B,
+        index_map_C,
+        LDMATRIX_16x32_A_INTRIN,
+        LDMATRIX_32x16_B_INTRIN,
+        MMA_i8i8i32_INTRIN,
+        MMA_fill_16x16_i32_INTRIN,
+        MMA_store_16x16_i32_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("i8i8i32_m16n16k32: %f GOPS" % (gflops / (timer().mean)))
+
+    timer = run_test(
+        k_inner,
+        in_dtype,
+        out_dtype,
+        True,  # b_transposed
+        i_factors,
+        j_factors,
+        k_factors,
+        index_map_A,
+        index_map_A,  # intentional: transposed B is tiled 16 x k_inner like A, so A's map applies
+        index_map_C,
+        LDMATRIX_16x32_A_INTRIN,
+        LDMATRIX_16x32_B_TRANS_INTRIN,
+        MMA_i8i8i32_TRANS_INTRIN,
+        MMA_fill_16x16_i32_INTRIN,
+        MMA_store_16x16_i32_global_INTRIN,
+    )
+
+    if measure_perf and timer:  # timer is None when is_ampere_or_newer() is false
+        print("i8i8i32_m16n16k32_trans: %f GOPS" % (gflops / (timer().mean)))
+
+
+if __name__ == "__main__":  # script entry point: call the three tests directly
+    test_f16f16f32_m16n16k16()
+    test_f16f16f16_m16n16k16()
+    test_i8i8i32_m16n16k32()

From 85e42b6af38ea3bd0c99c8208d7baed5086a8959 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 20 May 2022 15:09:19 -0500
Subject: [PATCH 51/59] [skip ci] Fix scipy intersphinx link (#11399)

Follow-up from https://github.com/apache/tvm/pull/10181, as the URL
has changed again in https://github.com/scipy/scipy/pull/16221.  From
[this
comment](https://github.com/scipy/scipy/issues/14267#issuecomment-1034196161),
the `html-scipyorg` portion wasn't intended to be part of the URL.

This should resolve the HTTP 404 occurring in `Docs: GPU`
step (e.g. [here](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-11269/13/pipeline/405#step-975-log-73)),
by accessing `https://docs.scipy.org/doc/scipy-1.8.0/objects.inv`
instead of
`https://docs.scipy.org/doc/scipy-1.8.0/html-scipyorg/objects.inv`
---
 docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index da31c3a4243c9..400d959bade6c 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -203,7 +203,7 @@ def git_describe_version(original_version):
 intersphinx_mapping = {
     "python": ("https://docs.python.org/{.major}".format(sys.version_info), None),
     # "numpy": ("https://numpy.org/doc/stable", None),
-    "scipy": ("https://docs.scipy.org/doc/scipy-1.8.0/html-scipyorg/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy-1.8.0/", None),
     # "matplotlib": ("https://matplotlib.org/", None),
 }
 

From 50997035befc0383dcba21808ab739d9ed8df08c Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 20 May 2022 16:09:01 -0700
Subject: [PATCH 52/59] [ci] Restructure Jenkinsfile (#11380)

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 .gitattributes                |   2 +
 Jenkinsfile                   | 278 +++++-------
 jenkins/Build.groovy.j2       | 186 ++++++++
 jenkins/Deploy.groovy.j2      |  71 +++
 jenkins/DockerBuild.groovy.j2 | 158 +++++++
 jenkins/Jenkinsfile.j2        | 812 +---------------------------------
 jenkins/Lint.groovy.j2        |  18 +
 jenkins/Prepare.groovy.j2     | 133 ++++++
 jenkins/README.md             |  28 ++
 jenkins/Test.groovy.j2        | 236 ++++++++++
 tests/lint/rat-excludes       |   8 +
 11 files changed, 977 insertions(+), 953 deletions(-)
 create mode 100644 .gitattributes
 create mode 100644 jenkins/Build.groovy.j2
 create mode 100644 jenkins/Deploy.groovy.j2
 create mode 100644 jenkins/DockerBuild.groovy.j2
 create mode 100644 jenkins/Lint.groovy.j2
 create mode 100644 jenkins/Prepare.groovy.j2
 create mode 100644 jenkins/README.md
 create mode 100644 jenkins/Test.groovy.j2

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000..29e2373f30ff8
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+Jenkinsfile linguist-generated=true
+
diff --git a/Jenkinsfile b/Jenkinsfile
index 7b8c8f890db15..0b64f9306844d 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-05-20T18:06:10.772162
+// Generated at 2022-05-20T13:24:01.371704
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -86,6 +86,20 @@ docker_build = 'docker/build.sh'
 max_time = 180
 rebuild_docker_images = false
 
+// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/
+// execute this before anything else, including requesting any time on an agent
+if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
+  print "INFO: Build skipped due to trigger being Branch Indexing"
+  currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful
+  return
+}
+
+// Filenames for stashing between build and test steps
+s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
+
+// General note: Jenkins has limits on the size of a method (or top level code)
+// that are pretty strict, so most usage of groovy methods in these templates
+// are purely to satisfy the JVM
 def per_exec_ws(folder) {
   return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
 }
@@ -183,146 +197,52 @@ def should_skip_ci(pr_number) {
   return git_skip_ci_code == 0
 }
 
-// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/
-// execute this before anything else, including requesting any time on an agent
-if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
-  print "INFO: Build skipped due to trigger being Branch Indexing"
-  currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful
-  return
-}
-
-cancel_previous_build()
-
-def lint() {
-stage('Lint') {
-  parallel(
-  'Lint 1 of 2': {
+def prepare() {
+  stage('Prepare') {
     node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") {
         init_git()
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'TVM_NUM_SHARDS=2',
-            'TVM_SHARD_INDEX=0'], {
-            ci_arm = params.ci_arm_param ?: ci_arm
-            ci_cpu = params.ci_cpu_param ?: ci_cpu
-            ci_gpu = params.ci_gpu_param ?: ci_gpu
-            ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
-            ci_i386 = params.ci_i386_param ?: ci_i386
-            ci_lint = params.ci_lint_param ?: ci_lint
-            ci_qemu = params.ci_qemu_param ?: ci_qemu
-            ci_wasm = params.ci_wasm_param ?: ci_wasm
-
-            sh (script: """
-              echo "Docker images being used in this build:"
-              echo " ci_arm = ${ci_arm}"
-              echo " ci_cpu = ${ci_cpu}"
-              echo " ci_gpu = ${ci_gpu}"
-              echo " ci_hexagon = ${ci_hexagon}"
-              echo " ci_i386 = ${ci_i386}"
-              echo " ci_lint = ${ci_lint}"
-              echo " ci_qemu = ${ci_qemu}"
-              echo " ci_wasm = ${ci_wasm}"
-            """, label: 'Docker image names')
-
-            is_docs_only_build = sh (
-              returnStatus: true,
-              script: './tests/scripts/git_change_docs.sh',
-              label: 'Check for docs only changes',
-            )
-            skip_ci = should_skip_ci(env.CHANGE_ID)
-            skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
-            rebuild_docker_images = sh (
-              returnStatus: true,
-              script: './tests/scripts/git_change_docker.sh',
-              label: 'Check for any docker changes',
-            )
-            if (skip_ci) {
-              // Don't rebuild when skipping CI
-              rebuild_docker_images = false
-            }
-            if (rebuild_docker_images) {
-              // Exit before linting so we can use the newly created Docker images
-              // to run the lint
-              return
-            }
-            sh (
-              script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
-              label: 'Run lint',
-            )
-          })
-        }
-      }
-    }
-  },
-  'Lint 2 of 2': {
-    node('CPU-SMALL') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
-        init_git()
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'TVM_NUM_SHARDS=2',
-            'TVM_SHARD_INDEX=1'], {
-            ci_arm = params.ci_arm_param ?: ci_arm
-            ci_cpu = params.ci_cpu_param ?: ci_cpu
-            ci_gpu = params.ci_gpu_param ?: ci_gpu
-            ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
-            ci_i386 = params.ci_i386_param ?: ci_i386
-            ci_lint = params.ci_lint_param ?: ci_lint
-            ci_qemu = params.ci_qemu_param ?: ci_qemu
-            ci_wasm = params.ci_wasm_param ?: ci_wasm
-
-            sh (script: """
-              echo "Docker images being used in this build:"
-              echo " ci_arm = ${ci_arm}"
-              echo " ci_cpu = ${ci_cpu}"
-              echo " ci_gpu = ${ci_gpu}"
-              echo " ci_hexagon = ${ci_hexagon}"
-              echo " ci_i386 = ${ci_i386}"
-              echo " ci_lint = ${ci_lint}"
-              echo " ci_qemu = ${ci_qemu}"
-              echo " ci_wasm = ${ci_wasm}"
-            """, label: 'Docker image names')
-
-            is_docs_only_build = sh (
-              returnStatus: true,
-              script: './tests/scripts/git_change_docs.sh',
-              label: 'Check for docs only changes',
-            )
-            skip_ci = should_skip_ci(env.CHANGE_ID)
-            skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
-            rebuild_docker_images = sh (
-              returnStatus: true,
-              script: './tests/scripts/git_change_docker.sh',
-              label: 'Check for any docker changes',
-            )
-            if (skip_ci) {
-              // Don't rebuild when skipping CI
-              rebuild_docker_images = false
-            }
-            if (rebuild_docker_images) {
-              // Exit before linting so we can use the newly created Docker images
-              // to run the lint
-              return
-            }
-            sh (
-              script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
-              label: 'Run lint',
-            )
-          })
+        ci_arm = params.ci_arm_param ?: ci_arm
+        ci_cpu = params.ci_cpu_param ?: ci_cpu
+        ci_gpu = params.ci_gpu_param ?: ci_gpu
+        ci_hexagon = params.ci_hexagon_param ?: ci_hexagon
+        ci_i386 = params.ci_i386_param ?: ci_i386
+        ci_lint = params.ci_lint_param ?: ci_lint
+        ci_qemu = params.ci_qemu_param ?: ci_qemu
+        ci_wasm = params.ci_wasm_param ?: ci_wasm
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          echo " ci_arm = ${ci_arm}"
+          echo " ci_cpu = ${ci_cpu}"
+          echo " ci_gpu = ${ci_gpu}"
+          echo " ci_hexagon = ${ci_hexagon}"
+          echo " ci_i386 = ${ci_i386}"
+          echo " ci_lint = ${ci_lint}"
+          echo " ci_qemu = ${ci_qemu}"
+          echo " ci_wasm = ${ci_wasm}"
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (
+          returnStatus: true,
+          script: './tests/scripts/git_change_docs.sh',
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (
+          returnStatus: true,
+          script: './tests/scripts/git_change_docker.sh',
+          label: 'Check for any docker changes',
+        )
+        if (skip_ci) {
+          // Don't rebuild when skipping CI
+          rebuild_docker_images = false
         }
       }
     }
-  },
-  )
-}
+  }
 }
-
-// [note: method size]
-// This has to be extracted into a method due to JVM limitations on the size of
-// a method (so the code can't all be inlined)
-lint()
-
 def build_image(image_name) {
   hash = sh(
     returnStdout: true,
@@ -378,7 +298,7 @@ def build_image(image_name) {
   )
 }
 
-if (rebuild_docker_images) {
+def build_docker_images() {
   stage('Docker Image Build') {
     // TODO in a follow up PR: Find ecr tag and use in subsequent builds
     parallel 'ci-lint': {
@@ -481,11 +401,46 @@ def make(docker_type, path, make_flag) {
     }
   }
 }
-
-// Filenames for stashing between build and test steps
-s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
-
-
+def lint() {
+  stage('Lint') {
+    parallel(
+  'Lint 1 of 2': {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
+        init_git()
+        timeout(time: max_time, unit: 'MINUTES') {
+          withEnv([
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=0'], {
+            sh (
+                script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+                label: 'Run lint',
+              )
+          })
+        }
+      }
+    }
+  },
+  'Lint 2 of 2': {
+    node('CPU-SMALL') {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") {
+        init_git()
+        timeout(time: max_time, unit: 'MINUTES') {
+          withEnv([
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=1'], {
+            sh (
+                script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+                label: 'Run lint',
+              )
+          })
+        }
+      }
+    }
+  },
+    )
+  }
+}
 def ci_setup(image) {
   sh (
     script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh",
@@ -529,7 +484,6 @@ def add_microtvm_permissions() {
   )
 }
 
-
 def build() {
 stage('Build') {
   environment {
@@ -771,10 +725,6 @@ stage('Build') {
   )
 }
 }
-
-// [note: method size]
-build()
-
 def test() {
 stage('Test') {
   environment {
@@ -1845,10 +1795,6 @@ stage('Test') {
   )
 }
 }
-
-// [note: method size]
-test()
-
 /*
 stage('Build packages') {
   parallel 'conda CPU': {
@@ -1907,11 +1853,13 @@ def deploy_docs() {
   }
 }
 
-stage('Deploy') {
-  if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
-    node('CPU') {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") {
-        sh(
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
+      node('CPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") {
+          sh(
             script: """
               set -eux
               aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz
@@ -1920,8 +1868,26 @@ stage('Deploy') {
             label: 'Download artifacts from S3',
           )
 
-        deploy_docs()
+          deploy_docs()
+        }
       }
     }
   }
 }
+
+
+cancel_previous_build()
+
+prepare()
+
+if (rebuild_docker_images) {
+  build_docker_images()
+}
+
+lint()
+
+build()
+
+test()
+
+deploy()
diff --git a/jenkins/Build.groovy.j2 b/jenkins/Build.groovy.j2
new file mode 100644
index 0000000000000..c1715949175bc
--- /dev/null
+++ b/jenkins/Build.groovy.j2
@@ -0,0 +1,186 @@
+def ci_setup(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh",
+    label: 'Set up CI environment',
+  )
+}
+
+def python_unittest(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
+    label: 'Run Python unit tests',
+  )
+}
+
+def fsim_test(image) {
+  sh (
+    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
+    label: 'Run VTA tests in FSIM',
+  )
+}
+
+def cmake_build(image, path, make_flag) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    label: 'Run cmake build',
+  )
+}
+
+def cpp_unittest(image) {
+  sh (
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
+    label: 'Build and run C++ tests',
+  )
+}
+
+
+def add_microtvm_permissions() {
+  {% for folder in microtvm_template_projects %}
+  sh(
+    script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x',
+    label: 'Add execute permissions for microTVM files',
+  )
+  {% endfor %}
+}
+
+def build() {
+stage('Build') {
+  environment {
+    SKIP_SLOW_TESTS = "${skip_slow_tests}"
+  }
+  parallel(
+    'BUILD: GPU': {
+    if (!skip_ci) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
+          init_git()
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
+          make("${ci_gpu} --no-gpu", 'build', '-j2')
+          {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+
+          // compiler test
+          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
+          make("${ci_gpu} --no-gpu", 'build2', '-j2')
+          {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
+        }
+      }
+    }
+  },
+  'BUILD: CPU': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-cpu') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
+            label: 'Create CPU cmake config',
+          )
+          make(ci_cpu, 'build', '-j2')
+          {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+          timeout(time: max_time, unit: 'MINUTES') {
+            ci_setup(ci_cpu)
+            // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
+            // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
+            sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: CPU')
+    }
+  },
+  'BUILD: WASM': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-wasm') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
+            label: 'Create WASM cmake config',
+          )
+          make(ci_wasm, 'build', '-j2')
+          cpp_unittest(ci_wasm)
+          timeout(time: max_time, unit: 'MINUTES') {
+            ci_setup(ci_wasm)
+            sh (
+              script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
+              label: 'Run WASM lint and tests',
+            )
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: WASM')
+    }
+  },
+  'BUILD: i386': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-i386') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
+            label: 'Create i386 cmake config',
+          )
+          make(ci_i386, 'build', '-j2')
+          {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('BUILD: i386')
+    }
+  },
+  'BUILD: arm': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('ARM') {
+        ws({{ m.per_exec_ws('tvm/build-arm') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
+            label: 'Create ARM cmake config',
+          )
+          make(ci_arm, 'build', '-j4')
+          {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
+        }
+      }
+     } else {
+      Utils.markStageSkippedForConditional('BUILD: arm')
+    }
+  },
+  'BUILD: QEMU': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-qemu') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build",
+            label: 'Create QEMU cmake config',
+          )
+          make(ci_qemu, 'build', '-j2')
+          {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
+        }
+      }
+     } else {
+      Utils.markStageSkippedForConditional('BUILD: QEMU')
+    }
+  },
+  'BUILD: Hexagon': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU-SMALL') {
+        ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
+          init_git()
+          sh (
+            script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
+            label: 'Create Hexagon cmake config',
+          )
+          make(ci_hexagon, 'build', '-j2')
+          {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }}
+        }
+      }
+     } else {
+      Utils.markStageSkippedForConditional('BUILD: Hexagon')
+    }
+  },
+  )
+}
+}
diff --git a/jenkins/Deploy.groovy.j2 b/jenkins/Deploy.groovy.j2
new file mode 100644
index 0000000000000..917f71ded1ff3
--- /dev/null
+++ b/jenkins/Deploy.groovy.j2
@@ -0,0 +1,71 @@
+/*
+stage('Build packages') {
+  parallel 'conda CPU': {
+    node('CPU') {
+      sh "${docker_run} tlcpack/conda-cpu ./conda/build_cpu.sh
+    }
+  },
+  'conda cuda': {
+    node('CPU') {
+      sh "${docker_run} tlcpack/conda-cuda90 ./conda/build_cuda.sh
+      sh "${docker_run} tlcpack/conda-cuda100 ./conda/build_cuda.sh
+    }
+  }
+// Here we could upload the packages to anaconda for releases
+// and/or the main branch
+}
+*/
+
+def deploy_docs() {
+  // Note: This code must stay in the Jenkinsfile to ensure that it runs
+  // from a trusted context only
+  sh(
+    script: '''
+      set -eux
+      rm -rf tvm-site
+      git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site
+      cd tvm-site
+      git status
+      git checkout -B $DOCS_DEPLOY_BRANCH
+
+      rm -rf docs
+      mkdir -p docs
+      tar xf ../docs.tgz -C docs
+      COMMIT=$(cat docs/commit_hash)
+      git add .
+      git config user.name tvm-bot
+      git config user.email 95660001+tvm-bot@users.noreply.github.com
+      git commit -m"deploying docs (apache/tvm@$COMMIT)"
+      git status
+    ''',
+    label: 'Unpack docs and update tvm-site'
+  )
+
+  withCredentials([string(
+    credentialsId: 'docs-push-token',
+    variable: 'GITHUB_TOKEN',
+    )]) {
+    sh(
+      script: '''
+        cd tvm-site
+        git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
+        git push deploy $DOCS_DEPLOY_BRANCH
+      ''',
+      label: 'Upload docs to apache/tvm-site'
+    )
+  }
+}
+
+
+def deploy() {
+  stage('Deploy') {
+    if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
+      node('CPU') {
+        ws({{ m.per_exec_ws('tvm/deploy-docs') }}) {
+          {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }}
+          deploy_docs()
+        }
+      }
+    }
+  }
+}
diff --git a/jenkins/DockerBuild.groovy.j2 b/jenkins/DockerBuild.groovy.j2
new file mode 100644
index 0000000000000..84bb8e3e376d1
--- /dev/null
+++ b/jenkins/DockerBuild.groovy.j2
@@ -0,0 +1,158 @@
+def build_image(image_name) {
+  hash = sh(
+    returnStdout: true,
+    script: 'git log -1 --format=\'%h\''
+  ).trim()
+  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
+  sh(
+    script: "${docker_build} ${image_name} --spec ${full_name}",
+    label: 'Build docker image'
+  )
+  aws_account_id = sh(
+    returnStdout: true,
+    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
+    label: 'Get AWS ID'
+  ).trim()
+
+  try {
+    // Use a credential so Jenkins knows to scrub the AWS account ID which is nice
+    // (but so we don't have to rely it being hardcoded in Jenkins)
+    withCredentials([string(
+      credentialsId: 'aws-account-id',
+      variable: '_ACCOUNT_ID_DO_NOT_USE',
+      )]) {
+      withEnv([
+        "AWS_ACCOUNT_ID=${aws_account_id}",
+        'AWS_DEFAULT_REGION=us-west-2']) {
+        sh(
+          script: '''
+            set -x
+            aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
+          ''',
+          label: 'Log in to ECR'
+        )
+        sh(
+          script: """
+            set -x
+            docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name}
+            docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name}
+          """,
+          label: 'Upload image to ECR'
+        )
+      }
+    }
+  } finally {
+    sh(
+      script: 'rm -f ~/.docker/config.json',
+      label: 'Clean up login credentials'
+    )
+  }
+  sh(
+    script: "docker rmi ${full_name}",
+    label: 'Remove docker image'
+  )
+}
+
+def build_docker_images() {
+  stage('Docker Image Build') {
+    // TODO in a follow up PR: Find ecr tag and use in subsequent builds
+    parallel 'ci-lint': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_lint')
+        }
+      }
+    }, 'ci-cpu': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_cpu')
+        }
+      }
+    }, 'ci-gpu': {
+      node('GPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_gpu')
+        }
+      }
+    }, 'ci-qemu': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_qemu')
+        }
+      }
+    }, 'ci-i386': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_i386')
+        }
+      }
+    }, 'ci-arm': {
+      node('ARM') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_arm')
+        }
+      }
+    }, 'ci-wasm': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_wasm')
+        }
+      }
+    }, 'ci-hexagon': {
+      node('CPU') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          init_git()
+          build_image('ci_hexagon')
+        }
+      }
+    }
+  }
+  // // TODO: Once we are able to use the built images, enable this step
+  // // If the docker images changed, we need to run the image build before the lint
+  // // can run since it requires a base docker image. Most of the time the images
+  // // aren't build though so it's faster to use the same node that checks for
+  // // docker changes to run the lint in the usual case.
+  // stage('Sanity Check (re-run)') {
+  //   timeout(time: max_time, unit: 'MINUTES') {
+  //     node('CPU') {
+  //       ws({{ m.per_exec_ws('tvm/sanity') }}) {
+  //         init_git()
+  //         sh (
+  //           script: "${docker_run} ${ci_lint}  ./tests/scripts/task_lint.sh",
+  //           label: 'Run lint',
+  //         )
+  //       }
+  //     }
+  //   }
+  // }
+}
+
+// Run make. First try to do an incremental make from a previous workspace in hope to
+// accelerate the compilation. If something is wrong, clean the workspace and then
+// build from scratch.
+def make(docker_type, path, make_flag) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    try {
+      cmake_build(docker_type, path, make_flag)
+      // always run cpp test when build
+    } catch (hudson.AbortException ae) {
+      // script exited due to user abort, directly throw instead of retry
+      if (ae.getMessage().contains('script returned exit code 143')) {
+        throw ae
+      }
+      echo 'Incremental compilation failed. Fall back to build from scratch'
+      sh (
+        script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}",
+        label: 'Clear old cmake workspace',
+      )
+      cmake_build(docker_type, path, make_flag)
+    }
+  }
+}
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index b00ee0272626c..a1127ec6a8d5b 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -83,103 +83,6 @@ docker_build = 'docker/build.sh'
 max_time = 180
 rebuild_docker_images = false
 
-def per_exec_ws(folder) {
-  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
-}
-
-// initialize source codes
-def init_git() {
-  checkout scm
-
-  // Clear out all Docker images that aren't going to be used
-  sh(
-    script: "docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}}  {{.ID}}'{% endraw %} | { grep -vE '{% for image in images %}{% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %}{% if not loop.last %}|{% endif %}{% endfor %}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }",
-    label: 'Clean old Docker images',
-  )
-  // Add more info about job node
-  sh (
-    script: './tests/scripts/task_show_node_info.sh',
-    label: 'Show executor node info',
-  )
-
-  // Determine merge commit to use for all stages
-  sh (
-    script: 'git fetch origin main',
-    label: 'Fetch upstream',
-  )
-  if (upstream_revision == null) {
-    upstream_revision = sh(
-      script: 'git log -1 FETCH_HEAD --format=\'%H\'',
-      label: 'Determine upstream revision',
-      returnStdout: true,
-    ).trim()
-  }
-  sh (
-    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
-    label: 'Merge to origin/main'
-  )
-
-  retry(5) {
-    timeout(time: 2, unit: 'MINUTES') {
-      sh (script: 'git submodule update --init -f', label: 'Update git submodules')
-    }
-  }
-}
-
-def should_skip_slow_tests(pr_number) {
-  withCredentials([string(
-    credentialsId: 'tvm-bot-jenkins-reader',
-    variable: 'GITHUB_TOKEN',
-  )]) {
-    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
-    result = sh (
-      returnStatus: true,
-      script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'",
-      label: 'Check if CI should run slow tests',
-    )
-  }
-  return result == 0
-}
-
-def cancel_previous_build() {
-  // cancel previous build if it is not on main.
-  if (env.BRANCH_NAME != 'main') {
-    def buildNumber = env.BUILD_NUMBER as int
-    // Milestone API allows us to cancel previous build
-    // with the same milestone number
-    if (buildNumber > 1) milestone(buildNumber - 1)
-    milestone(buildNumber)
-  }
-}
-
-def should_skip_ci(pr_number) {
-  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
-    // never skip CI on build sourced from a branch
-    return false
-  }
-  glob_skip_ci_code = sh (
-    returnStatus: true,
-    script: "./tests/scripts/git_skip_ci_globs.py",
-    label: 'Check if CI should be skipped due to changed files',
-  )
-  if (glob_skip_ci_code == 0) {
-    return true
-  }
-  withCredentials([string(
-    credentialsId: 'tvm-bot-jenkins-reader',
-    variable: 'TOKEN',
-    )]) {
-    // Exit code of 1 means run full CI (or the script had an error, so run
-    // full CI just in case). Exit code of 0 means skip CI.
-    git_skip_ci_code = sh (
-      returnStatus: true,
-      script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'",
-      label: 'Check if CI should be skipped',
-    )
-  }
-  return git_skip_ci_code == 0
-}
-
 // skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/
 // execute this before anything else, including requesting any time on an agent
 if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
@@ -188,217 +91,6 @@ if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
   return
 }
 
-cancel_previous_build()
-
-def lint() {
-stage('Lint') {
-  parallel(
-    {% call m.sharded_lint_step(name='Lint', num_shards=2, node='CPU-SMALL', ws='tvm/lint') %}
-      {% for image in images %}
-      {{ image.name }} = params.{{ image.name }}_param ?: {{ image.name }}
-      {% endfor %}
-
-      sh (script: """
-        echo "Docker images being used in this build:"
-        {% for image in images %}
-        echo " {{ image.name }} = ${ {{- image.name -}} }"
-        {% endfor %}
-      """, label: 'Docker image names')
-
-      is_docs_only_build = sh (
-        returnStatus: true,
-        script: './tests/scripts/git_change_docs.sh',
-        label: 'Check for docs only changes',
-      )
-      skip_ci = should_skip_ci(env.CHANGE_ID)
-      skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
-      rebuild_docker_images = sh (
-        returnStatus: true,
-        script: './tests/scripts/git_change_docker.sh',
-        label: 'Check for any docker changes',
-      )
-      if (skip_ci) {
-        // Don't rebuild when skipping CI
-        rebuild_docker_images = false
-      }
-      if (rebuild_docker_images) {
-        // Exit before linting so we can use the newly created Docker images
-        // to run the lint
-        return
-      }
-      sh (
-        script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
-        label: 'Run lint',
-      )
-    {% endcall %}
-  )
-}
-}
-
-// [note: method size]
-// This has to be extracted into a method due to JVM limitations on the size of
-// a method (so the code can't all be inlined)
-lint()
-
-def build_image(image_name) {
-  hash = sh(
-    returnStdout: true,
-    script: 'git log -1 --format=\'%h\''
-  ).trim()
-  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
-  sh(
-    script: "${docker_build} ${image_name} --spec ${full_name}",
-    label: 'Build docker image'
-  )
-  aws_account_id = sh(
-    returnStdout: true,
-    script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"',
-    label: 'Get AWS ID'
-  ).trim()
-
-  try {
-    // Use a credential so Jenkins knows to scrub the AWS account ID which is nice
-    // (but so we don't have to rely it being hardcoded in Jenkins)
-    withCredentials([string(
-      credentialsId: 'aws-account-id',
-      variable: '_ACCOUNT_ID_DO_NOT_USE',
-      )]) {
-      withEnv([
-        "AWS_ACCOUNT_ID=${aws_account_id}",
-        'AWS_DEFAULT_REGION=us-west-2']) {
-        sh(
-          script: '''
-            set -x
-            aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com
-          ''',
-          label: 'Log in to ECR'
-        )
-        sh(
-          script: """
-            set -x
-            docker tag ${full_name} \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name}
-            docker push \$AWS_ACCOUNT_ID.dkr.ecr.\$AWS_DEFAULT_REGION.amazonaws.com/${full_name}
-          """,
-          label: 'Upload image to ECR'
-        )
-      }
-    }
-  } finally {
-    sh(
-      script: 'rm -f ~/.docker/config.json',
-      label: 'Clean up login credentials'
-    )
-  }
-  sh(
-    script: "docker rmi ${full_name}",
-    label: 'Remove docker image'
-  )
-}
-
-if (rebuild_docker_images) {
-  stage('Docker Image Build') {
-    // TODO in a follow up PR: Find ecr tag and use in subsequent builds
-    parallel 'ci-lint': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_lint')
-        }
-      }
-    }, 'ci-cpu': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_cpu')
-        }
-      }
-    }, 'ci-gpu': {
-      node('GPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_gpu')
-        }
-      }
-    }, 'ci-qemu': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_qemu')
-        }
-      }
-    }, 'ci-i386': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_i386')
-        }
-      }
-    }, 'ci-arm': {
-      node('ARM') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_arm')
-        }
-      }
-    }, 'ci-wasm': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_wasm')
-        }
-      }
-    }, 'ci-hexagon': {
-      node('CPU') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          init_git()
-          build_image('ci_hexagon')
-        }
-      }
-    }
-  }
-  // // TODO: Once we are able to use the built images, enable this step
-  // // If the docker images changed, we need to run the image build before the lint
-  // // can run since it requires a base docker image. Most of the time the images
-  // // aren't build though so it's faster to use the same node that checks for
-  // // docker changes to run the lint in the usual case.
-  // stage('Sanity Check (re-run)') {
-  //   timeout(time: max_time, unit: 'MINUTES') {
-  //     node('CPU') {
-  //       ws({{ m.per_exec_ws('tvm/sanity') }}) {
-  //         init_git()
-  //         sh (
-  //           script: "${docker_run} ${ci_lint}  ./tests/scripts/task_lint.sh",
-  //           label: 'Run lint',
-  //         )
-  //       }
-  //     }
-  //   }
-  // }
-}
-
-// Run make. First try to do an incremental make from a previous workspace in hope to
-// accelerate the compilation. If something is wrong, clean the workspace and then
-// build from scratch.
-def make(docker_type, path, make_flag) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    try {
-      cmake_build(docker_type, path, make_flag)
-      // always run cpp test when build
-    } catch (hudson.AbortException ae) {
-      // script exited due to user abort, directly throw instead of retry
-      if (ae.getMessage().contains('script returned exit code 143')) {
-        throw ae
-      }
-      echo 'Incremental compilation failed. Fall back to build from scratch'
-      sh (
-        script: "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}",
-        label: 'Clear old cmake workspace',
-      )
-      cmake_build(docker_type, path, make_flag)
-    }
-  }
-}
-
 // Filenames for stashing between build and test steps
 {% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %}
 {% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %}
@@ -407,503 +99,29 @@ def make(docker_type, path, make_flag) {
 {% set microtvm_template_projects = ['build/microtvm_template_projects',] %}
 s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
+// General note: Jenkins enforces fairly strict limits on the size of a method (or
+// of top-level code), so most of the Groovy methods in these templates exist
+// purely to keep each generated method within the JVM's size limit.
+{% include "jenkins/Prepare.groovy.j2" %}
+{% include "jenkins/DockerBuild.groovy.j2" %}
+{% include "jenkins/Lint.groovy.j2" %}
+{% include "jenkins/Build.groovy.j2" %}
+{% include "jenkins/Test.groovy.j2" %}
+{% include "jenkins/Deploy.groovy.j2" %}
 
-def ci_setup(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_ci_setup.sh",
-    label: 'Set up CI environment',
-  )
-}
-
-def python_unittest(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh",
-    label: 'Run Python unit tests',
-  )
-}
-
-def fsim_test(image) {
-  sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh",
-    label: 'Run VTA tests in FSIM',
-  )
-}
-
-def cmake_build(image, path, make_flag) {
-  sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
-    label: 'Run cmake build',
-  )
-}
 
-def cpp_unittest(image) {
-  sh (
-    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh",
-    label: 'Build and run C++ tests',
-  )
-}
+cancel_previous_build()
 
+prepare()
 
-def add_microtvm_permissions() {
-  {% for folder in microtvm_template_projects %}
-  sh(
-    script: 'find {{ folder }} -type f | grep qemu-hack | xargs chmod +x',
-    label: 'Add execute permissions for microTVM files',
-  )
-  {% endfor %}
+if (rebuild_docker_images) {
+  build_docker_images()
 }
 
+lint()
 
-def build() {
-stage('Build') {
-  environment {
-    SKIP_SLOW_TESTS = "${skip_slow_tests}"
-  }
-  parallel(
-    'BUILD: GPU': {
-    if (!skip_ci) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-gpu') }}) {
-          init_git()
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build"
-          make("${ci_gpu} --no-gpu", 'build', '-j2')
-          {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
-
-          // compiler test
-          sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh build2"
-          make("${ci_gpu} --no-gpu", 'build2', '-j2')
-          {{ m.upload_artifacts(tag='gpu2', filenames=tvm_multilib) }}
-        }
-      }
-    }
-  },
-  'BUILD: CPU': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-cpu') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build",
-            label: 'Create CPU cmake config',
-          )
-          make(ci_cpu, 'build', '-j2')
-          {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_cpu)
-            // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh"
-            // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch
-            sh (script: "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh", label: 'Rust build and test')
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: CPU')
-    }
-  },
-  'BUILD: WASM': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-wasm') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build",
-            label: 'Create WASM cmake config',
-          )
-          make(ci_wasm, 'build', '-j2')
-          cpp_unittest(ci_wasm)
-          timeout(time: max_time, unit: 'MINUTES') {
-            ci_setup(ci_wasm)
-            sh (
-              script: "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh",
-              label: 'Run WASM lint and tests',
-            )
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: WASM')
-    }
-  },
-  'BUILD: i386': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-i386') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build",
-            label: 'Create i386 cmake config',
-          )
-          make(ci_i386, 'build', '-j2')
-          {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim) }}
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('BUILD: i386')
-    }
-  },
-  'BUILD: arm': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('ARM') {
-        ws({{ m.per_exec_ws('tvm/build-arm') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build",
-            label: 'Create ARM cmake config',
-          )
-          make(ci_arm, 'build', '-j4')
-          {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: arm')
-    }
-  },
-  'BUILD: QEMU': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-qemu') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_qemu} ./tests/scripts/task_config_build_qemu.sh build",
-            label: 'Create QEMU cmake config',
-          )
-          make(ci_qemu, 'build', '-j2')
-          {{ m.upload_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: QEMU')
-    }
-  },
-  'BUILD: Hexagon': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU-SMALL') {
-        ws({{ m.per_exec_ws('tvm/build-hexagon') }}) {
-          init_git()
-          sh (
-            script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build",
-            label: 'Create Hexagon cmake config',
-          )
-          make(ci_hexagon, 'build', '-j2')
-          {{ m.upload_artifacts(tag='hexagon', filenames=tvm_lib) }}
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('BUILD: Hexagon')
-    }
-  },
-  )
-}
-}
-
-// [note: method size]
 build()
 
-def test() {
-stage('Test') {
-  environment {
-    SKIP_SLOW_TESTS = "${skip_slow_tests}"
-  }
-  parallel(
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="unittest: GPU",
-    num_shards=2,
-    node="GPU",
-    ws="tvm/ut-python-gpu",
-    platform="gpu",
-  ) %}
-    {% if shard_index == 1 %}
-    {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }}
-    cpp_unittest(ci_gpu)
-
-    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
-    ci_setup(ci_gpu)
-    cpp_unittest(ci_gpu)
-    {% else %}
-    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
-    ci_setup(ci_gpu)
-    {% endif %}
-    {% if shard_index == 2 or num_shards < 2 %}
-    sh (
-      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
-      label: 'Run Java unit tests',
-    )
-    {% endif %}
-    sh (
-      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
-      label: 'Run Python GPU unit tests',
-    )
-    sh (
-      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
-      label: 'Run Python GPU integration tests',
-    )
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="integration: CPU",
-    node="CPU",
-      num_shards=2,
-      ws="tvm/integration-python-cpu",
-      platform="cpu",
-    ) %}
-    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
-    ci_setup(ci_cpu)
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-      label: 'Run CPU integration tests',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="unittest: CPU",
-    node="CPU-SMALL",
-    ws="tvm/ut-python-cpu",
-    platform="cpu",
-  ) %}
-    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
-    ci_setup(ci_cpu)
-    cpp_unittest(ci_cpu)
-    python_unittest(ci_cpu)
-    fsim_test(ci_cpu)
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh",
-      label: 'Run VTA tests in TSIM',
-    )
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="python: i386",
-    node="CPU-SMALL",
-    num_shards=3,
-    ws="tvm/integration-python-i386",
-    platform="i386",
-  ) %}
-    {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }}
-    ci_setup(ci_i386)
-    {% if shard_index == 1 %}
-    cpp_unittest(ci_i386)
-    {% endif %}
-    python_unittest(ci_i386)
-    sh (
-      script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
-      label: 'Run i386 integration tests',
-    )
-    fsim_test(ci_i386)
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="test: Hexagon",
-    node="CPU-SMALL",
-    ws="tvm/test-hexagon",
-    platform="hexagon",
-    num_shards=4,
-  ) %}
-    {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }}
-    ci_setup(ci_hexagon)
-    {% if shard_index == 1 %}
-    cpp_unittest(ci_hexagon)
-    {% endif %}
-    sh (
-      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
-      label: 'Build Hexagon API',
-    )
-    sh (
-      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-      label: 'Run Hexagon tests',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="test: QEMU",
-    node="CPU-SMALL",
-    ws="tvm/test-qemu",
-    platform="qemu",
-  ) %}
-    {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
-    add_microtvm_permissions()
-    ci_setup(ci_qemu)
-    cpp_unittest(ci_qemu)
-    sh (
-      script: "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh",
-      label: 'Run microTVM tests',
-    )
-    sh (
-      script: "${docker_run} ${ci_qemu} ./tests/scripts/task_demo_microtvm.sh",
-      label: 'Run microTVM demos',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="topi: aarch64",
-    node="ARM",
-    ws="tvm/ut-python-arm",
-    platform="arm",
-) %}
-    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
-    ci_setup(ci_arm)
-    cpp_unittest(ci_arm)
-    sh (
-      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-      label: 'Run test_arm_compute_lib test',
-    )
-    sh (
-      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-      label: 'Run TOPI tests',
-    )
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="integration: aarch64",
-    num_shards=2,
-    node="ARM", ws="tvm/ut-python-arm",
-    platform="arm",
-  ) %}
-    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
-    ci_setup(ci_arm)
-    python_unittest(ci_arm)
-    sh (
-      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-      label: 'Run CPU integration tests',
-    )
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="topi: GPU",
-    node="GPU",
-    num_shards=2,
-    ws="tvm/topi-python-gpu",
-    platform="gpu",
-  ) %}
-    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
-    ci_setup(ci_gpu)
-    sh (
-      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-      label: 'Run TOPI tests',
-    )
-  {% endcall %}
-  {% call(shard_index, num_shards) m.sharded_test_step(
-    name="frontend: GPU", node="GPU",
-    num_shards=3,
-    ws="tvm/frontend-python-gpu",
-    platform="gpu",
-  ) %}
-    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
-    ci_setup(ci_gpu)
-    sh (
-      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
-      label: 'Run Python frontend tests',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="frontend: CPU",
-    node="CPU",
-    ws="tvm/frontend-python-cpu",
-    platform="cpu",
-) %}
-    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }}
-    ci_setup(ci_cpu)
-    sh (
-      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
-      label: 'Run Python frontend tests',
-    )
-  {% endcall %}
-  {% call m.test_step(
-    name="frontend: aarch64",
-    node="ARM",
-    ws="tvm/frontend-python-arm",
-    platform="arm",
-) %}
-    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
-    ci_setup(ci_arm)
-    sh (
-      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
-      label: 'Run Python frontend tests',
-    )
-  {% endcall %}
-  'docs: GPU': {
-    if (!skip_ci) {
-      node('GPU') {
-        ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
-          init_git()
-          {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
-          add_microtvm_permissions()
-          timeout(time: 180, unit: 'MINUTES') {
-            ci_setup(ci_gpu)
-            sh (
-              script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh",
-              label: 'Build docs',
-            )
-          }
-          {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
-          archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true)
-        }
-      }
-    }
-  },
-  )
-}
-}
-
-// [note: method size]
 test()
 
-/*
-stage('Build packages') {
-  parallel 'conda CPU': {
-    node('CPU') {
-      sh "${docker_run} tlcpack/conda-cpu ./conda/build_cpu.sh
-    }
-  },
-  'conda cuda': {
-    node('CPU') {
-      sh "${docker_run} tlcpack/conda-cuda90 ./conda/build_cuda.sh
-      sh "${docker_run} tlcpack/conda-cuda100 ./conda/build_cuda.sh
-    }
-  }
-// Here we could upload the packages to anaconda for releases
-// and/or the main branch
-}
-*/
-
-def deploy_docs() {
-  // Note: This code must stay in the Jenkinsfile to ensure that it runs
-  // from a trusted context only
-  sh(
-    script: '''
-      set -eux
-      rm -rf tvm-site
-      git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site
-      cd tvm-site
-      git status
-      git checkout -B $DOCS_DEPLOY_BRANCH
-
-      rm -rf docs
-      mkdir -p docs
-      tar xf ../docs.tgz -C docs
-      COMMIT=$(cat docs/commit_hash)
-      git add .
-      git config user.name tvm-bot
-      git config user.email 95660001+tvm-bot@users.noreply.github.com
-      git commit -m"deploying docs (apache/tvm@$COMMIT)"
-      git status
-    ''',
-    label: 'Unpack docs and update tvm-site'
-  )
-
-  withCredentials([string(
-    credentialsId: 'docs-push-token',
-    variable: 'GITHUB_TOKEN',
-    )]) {
-    sh(
-      script: '''
-        cd tvm-site
-        git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git
-        git push deploy $DOCS_DEPLOY_BRANCH
-      ''',
-      label: 'Upload docs to apache/tvm-site'
-    )
-  }
-}
-
-stage('Deploy') {
-  if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') {
-    node('CPU') {
-      ws({{ m.per_exec_ws('tvm/deploy-docs') }}) {
-        {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }}
-        deploy_docs()
-      }
-    }
-  }
-}
+deploy()
diff --git a/jenkins/Lint.groovy.j2 b/jenkins/Lint.groovy.j2
new file mode 100644
index 0000000000000..61c13cd407d02
--- /dev/null
+++ b/jenkins/Lint.groovy.j2
@@ -0,0 +1,18 @@
+def lint() {  // 'Lint' stage: runs task_lint.sh in the ci_lint image via the sharded_lint_step macro
+  stage('Lint') {
+    parallel(
+      {% call m.sharded_lint_step(
+        name='Lint',
+        num_shards=2,
+        node='CPU-SMALL',
+        ws='tvm/lint',
+        )
+      %}
+        sh (  // body handed to the macro; presumably emitted once per shard - confirm in macros
+          script: "${docker_run} ${ci_lint} ./tests/scripts/task_lint.sh",
+          label: 'Run lint',
+        )
+      {% endcall %}
+    )
+  }
+}
diff --git a/jenkins/Prepare.groovy.j2 b/jenkins/Prepare.groovy.j2
new file mode 100644
index 0000000000000..d7bf5e706b0be
--- /dev/null
+++ b/jenkins/Prepare.groovy.j2
@@ -0,0 +1,133 @@
+def per_exec_ws(folder) {  // workspace path for `folder`, prefixed with this executor's number
+  return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder
+}
+
+// Check out the sources, remove unneeded Docker images, merge upstream main, sync submodules
+def init_git() {
+  checkout scm
+
+  // Remove every local Docker image whose name is not one of the CI images in `images`
+  sh(
+    script: "docker image ls --all --format {% raw %}'{{.Repository}}:{{.Tag}}  {{.ID}}'{% endraw %} | { grep -vE '{% for image in images %}{% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %}{% if not loop.last %}|{% endif %}{% endfor %}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; }",
+    label: 'Clean old Docker images',
+  )
+  // Log details about the node this job is running on
+  sh (
+    script: './tests/scripts/task_show_node_info.sh',
+    label: 'Show executor node info',
+  )
+
+  // Fetch upstream main; the revision to merge is resolved once and then reused
+  sh (
+    script: 'git fetch origin main',
+    label: 'Fetch upstream',
+  )
+  if (upstream_revision == null) {  // only the first call records FETCH_HEAD's commit hash
+    upstream_revision = sh(
+      script: 'git log -1 FETCH_HEAD --format=\'%H\'',
+      label: 'Determine upstream revision',
+      returnStdout: true,
+    ).trim()
+  }
+  sh (
+    script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}",
+    label: 'Merge to origin/main'
+  )
+
+  retry(5) {  // submodule update is wrapped in retry(5); each attempt is capped at 2 minutes
+    timeout(time: 2, unit: 'MINUTES') {
+      sh (script: 'git submodule update --init -f', label: 'Update git submodules')
+    }
+  }
+}
+
+def should_skip_slow_tests(pr_number) {  // true when should_run_slow_tests.py exits 0 for this PR
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'GITHUB_TOKEN',
+  )]) {
+    // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests
+    result = sh (
+      returnStatus: true,
+      script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'",
+      label: 'Check if CI should run slow tests',
+    )
+  }
+  return result == 0  // any non-zero status (including 1) means the slow tests are run
+}
+
+def cancel_previous_build() {  // uses milestones so a newer build supersedes the previous one
+  // Only builds on branches other than main cancel their predecessor.
+  if (env.BRANCH_NAME != 'main') {
+    def buildNumber = env.BUILD_NUMBER as int
+    // The milestone API lets us cancel the previous build that is
+    // waiting on the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+def should_skip_ci(pr_number) {  // true when CI can be skipped for the PR `pr_number`
+  if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) {
+    // Never skip CI for a build whose branch name is not of the form 'PR-*'
+    return false
+  }
+  glob_skip_ci_code = sh (
+    returnStatus: true,
+    script: "./tests/scripts/git_skip_ci_globs.py",
+    label: 'Check if CI should be skipped due to changed files',
+  )
+  if (glob_skip_ci_code == 0) {  // exit code 0 from the changed-files check => skip CI
+    return true
+  }
+  withCredentials([string(
+    credentialsId: 'tvm-bot-jenkins-reader',
+    variable: 'TOKEN',
+    )]) {
+    // Exit code of 1 means run full CI (or the script had an error, so run
+    // full CI just in case). Exit code of 0 means skip CI.
+    git_skip_ci_code = sh (
+      returnStatus: true,
+      script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'",
+      label: 'Check if CI should be skipped',
+    )
+  }
+  return git_skip_ci_code == 0
+}
+
+def prepare() {  // 'Prepare' stage: check out sources and compute the flags that gate later stages
+  stage('Prepare') {
+    node('CPU-SMALL') {
+      ws(per_exec_ws('tvm/prepare')) {  // per-executor workspace, built by the helper in this file
+        init_git()
+        {% for image in images %}
+        {{ image.name }} = params.{{ image.name }}_param ?: {{ image.name }}
+        {% endfor %}
+
+        sh (script: """
+          echo "Docker images being used in this build:"
+          {% for image in images %}
+          echo " {{ image.name }} = ${ {{- image.name -}} }"
+          {% endfor %}
+        """, label: 'Docker image names')
+
+        is_docs_only_build = sh (  // holds the exit status of git_change_docs.sh
+          returnStatus: true,
+          script: './tests/scripts/git_change_docs.sh',
+          label: 'Check for docs only changes',
+        )
+        skip_ci = should_skip_ci(env.CHANGE_ID)
+        skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID)
+        rebuild_docker_images = sh (  // exit status; a non-zero value is truthy when tested with `if`
+          returnStatus: true,
+          script: './tests/scripts/git_change_docker.sh',
+          label: 'Check for any docker changes',
+        )
+        if (skip_ci) {
+          // Skipping CI overrides the docker check: never rebuild images in that case
+          rebuild_docker_images = false
+        }
+      }
+    }
+  }
+}
diff --git a/jenkins/README.md b/jenkins/README.md
new file mode 100644
index 0000000000000..454664b40c643
--- /dev/null
+++ b/jenkins/README.md
@@ -0,0 +1,28 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Jenkins CI
+
+The template files in this directory are used to generate the [`Jenkinsfile`](../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches.
+
+To regenerate the `Jenkinsfile`, run the following from the root of the repository:
+
+```bash
+pip install -r jenkins/requirements.txt
+python jenkins/generate.py
+```
+
diff --git a/jenkins/Test.groovy.j2 b/jenkins/Test.groovy.j2
new file mode 100644
index 0000000000000..b287c2a3156ec
--- /dev/null
+++ b/jenkins/Test.groovy.j2
@@ -0,0 +1,236 @@
+def test() {  // Runs every test step below as one branch of a single parallel() call
+stage('Test') {
+  environment {  // NOTE(review): declarative-style directive; confirm it takes effect here
+    SKIP_SLOW_TESTS = "${skip_slow_tests}"
+  }
+  parallel(
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="unittest: GPU",
+    num_shards=2,
+    node="GPU",
+    ws="tvm/ut-python-gpu",
+    platform="gpu",
+  ) %}
+    {% if shard_index == 1 %}
+    {{ m.download_artifacts(tag='gpu2', filenames=tvm_multilib) }}
+    cpp_unittest(ci_gpu)  // against the 'gpu2' artifacts downloaded just above
+
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+    ci_setup(ci_gpu)
+    cpp_unittest(ci_gpu)  // again, against the 'gpu' artifacts
+    {% else %}
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+    ci_setup(ci_gpu)
+    {% endif %}
+    {% if shard_index == 2 or num_shards < 2 %}
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh",
+      label: 'Run Java unit tests',
+    )
+    {% endif %}
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh",
+      label: 'Run Python GPU unit tests',
+    )
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh",
+      label: 'Run Python GPU integration tests',
+    )
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="integration: CPU",
+    node="CPU",
+    num_shards=2,
+    ws="tvm/integration-python-cpu",
+    platform="cpu",
+  ) %}
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+    ci_setup(ci_cpu)
+    sh (
+      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+      label: 'Run CPU integration tests',
+    )
+  {% endcall %}
+  {% call m.test_step(
+    name="unittest: CPU",
+    node="CPU-SMALL",
+    ws="tvm/ut-python-cpu",
+    platform="cpu",
+  ) %}
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib_tsim) }}
+    ci_setup(ci_cpu)
+    cpp_unittest(ci_cpu)
+    python_unittest(ci_cpu)
+    fsim_test(ci_cpu)
+    sh (
+      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh",
+      label: 'Run VTA tests in TSIM',
+    )
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="python: i386",
+    node="CPU-SMALL",
+    num_shards=3,
+    ws="tvm/integration-python-i386",
+    platform="i386",
+  ) %}
+    {{ m.download_artifacts(tag='i386', filenames=tvm_multilib) }}
+    ci_setup(ci_i386)
+    {% if shard_index == 1 %}
+    cpp_unittest(ci_i386)
+    {% endif %}
+    python_unittest(ci_i386)
+    sh (
+      script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh",
+      label: 'Run i386 integration tests',
+    )
+    fsim_test(ci_i386)
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="test: Hexagon",
+    node="CPU-SMALL",
+    ws="tvm/test-hexagon",
+    platform="hexagon",
+    num_shards=4,
+  ) %}
+    {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib) }}
+    ci_setup(ci_hexagon)
+    {% if shard_index == 1 %}
+    cpp_unittest(ci_hexagon)
+    {% endif %}
+    sh (
+      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh",
+      label: 'Build Hexagon API',
+    )
+    sh (
+      script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
+      label: 'Run Hexagon tests',
+    )
+  {% endcall %}
+  {% call m.test_step(
+    name="test: QEMU",
+    node="CPU-SMALL",
+    ws="tvm/test-qemu",
+    platform="qemu",
+  ) %}
+    {{ m.download_artifacts(tag='qemu', filenames=tvm_lib, folders=microtvm_template_projects) }}
+    add_microtvm_permissions()
+    ci_setup(ci_qemu)
+    cpp_unittest(ci_qemu)
+    sh (
+      script: "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh",
+      label: 'Run microTVM tests',
+    )
+    sh (
+      script: "${docker_run} ${ci_qemu} ./tests/scripts/task_demo_microtvm.sh",
+      label: 'Run microTVM demos',
+    )
+  {% endcall %}
+  {% call m.test_step(
+    name="topi: aarch64",
+    node="ARM",
+    ws="tvm/ut-python-arm",
+    platform="arm",
+  ) %}
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+    ci_setup(ci_arm)
+    cpp_unittest(ci_arm)
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+      label: 'Run test_arm_compute_lib test',
+    )
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+      label: 'Run TOPI tests',
+    )
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="integration: aarch64",
+    num_shards=2,
+    node="ARM", ws="tvm/ut-python-arm",
+    platform="arm",
+  ) %}
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+    ci_setup(ci_arm)
+    python_unittest(ci_arm)
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+      label: 'Run CPU integration tests',
+    )
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="topi: GPU",
+    node="GPU",
+    num_shards=2,
+    ws="tvm/topi-python-gpu",
+    platform="gpu",
+  ) %}
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+    ci_setup(ci_gpu)
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+      label: 'Run TOPI tests',
+    )
+  {% endcall %}
+  {% call(shard_index, num_shards) m.sharded_test_step(
+    name="frontend: GPU", node="GPU",
+    num_shards=3,
+    ws="tvm/frontend-python-gpu",
+    platform="gpu",
+  ) %}
+    {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib) }}
+    ci_setup(ci_gpu)
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+      label: 'Run Python frontend tests',
+    )
+  {% endcall %}
+  {% call m.test_step(
+    name="frontend: CPU",
+    node="CPU",
+    ws="tvm/frontend-python-cpu",
+    platform="cpu",
+  ) %}
+    {{ m.download_artifacts(tag='cpu', filenames=tvm_multilib) }}
+    ci_setup(ci_cpu)
+    sh (
+      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh",
+      label: 'Run Python frontend tests',
+    )
+  {% endcall %}
+  {% call m.test_step(
+    name="frontend: aarch64",
+    node="ARM",
+    ws="tvm/frontend-python-arm",
+    platform="arm",
+  ) %}
+    {{ m.download_artifacts(tag='arm', filenames=tvm_multilib) }}
+    ci_setup(ci_arm)
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh",
+      label: 'Run Python frontend tests',
+    )
+  {% endcall %}
+  'docs: GPU': {
+    if (!skip_ci) {
+      node('GPU') {
+        ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) {
+          init_git()
+          {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }}
+          add_microtvm_permissions()
+          timeout(time: 180, unit: 'MINUTES') {
+            ci_setup(ci_gpu)
+            sh (
+              script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh",
+              label: 'Build docs',
+            )
+          }
+          {{ m.upload_artifacts(tag='docs', filenames=["docs.tgz"]) }}
+          archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true)
+        }
+      }
+    }
+  },
+  )
+}
+}
diff --git a/tests/lint/rat-excludes b/tests/lint/rat-excludes
index 3dff79c565ce7..1cdb78e31913c 100644
--- a/tests/lint/rat-excludes
+++ b/tests/lint/rat-excludes
@@ -51,3 +51,11 @@ MANIFEST
 .bash_history
 rat-excludes
 Cargo.lock
+
+# Included template files
+Build.groovy.j2
+Deploy.groovy.j2
+DockerBuild.groovy.j2
+Lint.groovy.j2
+Prepare.groovy.j2
+Test.groovy.j2

From bbc6ba362f4ff223a6954f79cd237de25209ffbd Mon Sep 17 00:00:00 2001
From: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Date: Sat, 21 May 2022 09:57:37 +0800
Subject: [PATCH 53/59] [Meta Schedule] Add Auto-Thread Binding Rule (#11177)

Meta-schedule currently uses the postprocessor `RewriteUnboundBlock` to auto-bind blocks to threads. Because it is a postprocessor, it offers no search opportunity and always splits with `factor=1024`.

This PR adds a new schedule rule, `AutoBind`, which performs a similar thread binding but with sampled split factors, together with a corresponding mutator, `MutateThreadBinding`.

After applying this rule, we get some positive perf results (on RTX-3080):
Element-wise: from 2.76 us to 2.48 us
Conv2d Winograd: from 29.45 us to 18.96 us (ansor 22.00 us)
Resnet18: from 0.591 ms to 0.531 ms (ansor 0.565 ms)
---
 include/tvm/meta_schedule/mutator.h           |  10 +-
 include/tvm/meta_schedule/postproc.h          |   4 +-
 include/tvm/meta_schedule/schedule_rule.h     |   7 +
 python/tvm/meta_schedule/mutator/__init__.py  |   1 +
 .../mutator/mutate_thread_binding.py          |  32 +++
 .../postproc/rewrite_unbound_block.py         |   5 +-
 .../meta_schedule/schedule_rule/__init__.py   |   1 +
 .../meta_schedule/schedule_rule/auto_bind.py  |  49 +++++
 .../testing/conv2d_winograd_cpu.py            |   2 +-
 .../testing/conv2d_winograd_cuda.py           |   2 +-
 .../meta_schedule/testing/schedule_rule.py    |   8 +
 python/tvm/meta_schedule/tune.py              |   8 +-
 python/tvm/topi/cuda/conv2d_nhwc_winograd.py  |   2 +-
 python/tvm/topi/cuda/conv2d_winograd.py       |   2 +-
 python/tvm/topi/nn/conv2d.py                  |   7 +-
 .../mutator/mutate_thread_binding.cc          | 167 +++++++++++++++
 .../postproc/rewrite_unbound_block.cc         | 139 ++-----------
 src/meta_schedule/schedule_rule/auto_bind.cc  | 192 ++++++++++++++++++
 src/meta_schedule/schedule_rule/auto_bind.h   |  52 +++++
 src/meta_schedule/schedule_rule/winograd.cc   |  23 ++-
 ...meta_schedule_custom_rule_winograd_cuda.py |  96 ++++++++-
 ..._schedule_mutator_mutate_thread_binding.py |  86 ++++++++
 ...t_meta_schedule_schedule_rule_auto_bind.py |  75 +++++++
 23 files changed, 831 insertions(+), 139 deletions(-)
 create mode 100644 python/tvm/meta_schedule/mutator/mutate_thread_binding.py
 create mode 100644 python/tvm/meta_schedule/schedule_rule/auto_bind.py
 create mode 100644 src/meta_schedule/mutator/mutate_thread_binding.cc
 create mode 100644 src/meta_schedule/schedule_rule/auto_bind.cc
 create mode 100644 src/meta_schedule/schedule_rule/auto_bind.h
 create mode 100644 tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
 create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py

diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h
index 002fa51ee5e3d..d80fa70eee8a2 100644
--- a/include/tvm/meta_schedule/mutator.h
+++ b/include/tvm/meta_schedule/mutator.h
@@ -119,13 +119,21 @@ class Mutator : public runtime::ObjectRef {
    * \return The created mutator.
    */
   TVM_DLL static Mutator MutateParallel(int64_t max_jobs_per_core);
-  /*! \brief Create a Mutator that mutates auto unroll step */
+  /*!
+   * \brief Create a Mutator that mutates auto unroll step
+   * \return The mutator created
+   */
   TVM_DLL static Mutator MutateUnroll();
   /*!
    * \brief Create a Mutator that mutates the outcome of SampleComputeLocation
    * \return The mutator created
    */
   TVM_DLL static Mutator MutateComputeLocation();
+  /*!
+   * \brief Create a Mutator that mutates the sampled split factor of loops bound to threadIdx.x
+   * \return The mutator created
+   */
+  TVM_DLL static Mutator MutateThreadBinding();
   /*!
    * \brief Create a mutator with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h
index 8b32ce460933c..195d558550170 100644
--- a/include/tvm/meta_schedule/postproc.h
+++ b/include/tvm/meta_schedule/postproc.h
@@ -144,10 +144,10 @@ class Postproc : public runtime::ObjectRef {
   TVM_DLL static Postproc RewriteReductionBlock();
   /*!
    * \brief Create a postprocessor that adds thread binding to unbound blocks
-   * \param max_threadblock The max number of threadblocks in the cuda device.
+   * \param max_threadblocks The max number of threadblocks in the cuda device.
    * \return The postprocessor created.
    */
-  TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblock);
+  TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblocks);
   /*!
    * \brief Create a postprocessor that applies tensorization to annotated blocks
    * \param vectorize_init_loop Whether or not vectorize the initialization loop produced by
diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h
index 2b2eefeb75742..b39c72e24db8e 100644
--- a/include/tvm/meta_schedule/schedule_rule.h
+++ b/include/tvm/meta_schedule/schedule_rule.h
@@ -212,6 +212,13 @@ class ScheduleRule : public runtime::ObjectRef {
                                                          int max_vectorize_extent,         //
                                                          Array<Integer> unroll_max_steps,  //
                                                          bool unroll_explicit);
+  /*!
+   * \brief Auto bind loops around the block to BlockIdx and ThreadIdx
+   * \param max_threadblocks The maximum number of threadblocks on GPU
+   * \param thread_extents Candidate extents for the thread axis
+   * \return The schedule rule created
+   */
+  TVM_DLL static ScheduleRule AutoBind(int max_threadblocks, Array<Integer> thread_extents);
   /*!
    * \brief Create a schedule rule with customized methods on the python-side.
    * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`.
diff --git a/python/tvm/meta_schedule/mutator/__init__.py b/python/tvm/meta_schedule/mutator/__init__.py
index e534ba14346ee..a0f7bac357680 100644
--- a/python/tvm/meta_schedule/mutator/__init__.py
+++ b/python/tvm/meta_schedule/mutator/__init__.py
@@ -22,5 +22,6 @@
 from .mutator import Mutator, PyMutator
 from .mutate_compute_location import MutateComputeLocation
 from .mutate_tile_size import MutateTileSize
+from .mutate_thread_binding import MutateThreadBinding
 from .mutate_parallel import MutateParallel
 from .mutate_unroll import MutateUnroll
diff --git a/python/tvm/meta_schedule/mutator/mutate_thread_binding.py b/python/tvm/meta_schedule/mutator/mutate_thread_binding.py
new file mode 100644
index 0000000000000..6a2553f94346b
--- /dev/null
+++ b/python/tvm/meta_schedule/mutator/mutate_thread_binding.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Mutator that mutates the thread binding extent"""
+from tvm._ffi.registry import register_object
+
+from .. import _ffi_api
+from .mutator import Mutator
+
+
+@register_object("meta_schedule.MutateThreadBinding")
+class MutateThreadBinding(Mutator):
+    """Mutator that mutates the thread binding extent"""
+
+    def __init__(self) -> None:
+        """Create the mutator through the `MutateThreadBinding` FFI constructor (no arguments)"""
+        self.__init_handle_by_constructor__(
+            _ffi_api.MutateThreadBinding,  # type: ignore # pylint: disable=no-member
+        )
diff --git a/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py b/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py
index c89bc4b0369ab..aef5bca690e47 100644
--- a/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py
+++ b/python/tvm/meta_schedule/postproc/rewrite_unbound_block.py
@@ -17,6 +17,7 @@
 """A postprocessor that adds thread binding to unbound blocks"""
 
 from tvm._ffi.registry import register_object
+
 from .. import _ffi_api
 from .postproc import Postproc
 
@@ -25,8 +26,8 @@
 class RewriteUnboundBlock(Postproc):
     """A postprocessor that adds thread binding to unbound blocks"""
 
-    def __init__(self, max_threadblock: int = 256) -> None:
+    def __init__(self, max_threadblocks: int = 256) -> None:
         self.__init_handle_by_constructor__(
             _ffi_api.PostprocRewriteUnboundBlock,  # type: ignore # pylint: disable=no-member
-            max_threadblock,
+            max_threadblocks,
         )
diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py
index a958fdc39db1f..18fc1de78c7b2 100644
--- a/python/tvm/meta_schedule/schedule_rule/__init__.py
+++ b/python/tvm/meta_schedule/schedule_rule/__init__.py
@@ -20,6 +20,7 @@
 blocks in a schedule. See also PostOrderApply.
 """
 from .add_rfactor import AddRFactor
+from .auto_bind import AutoBind
 from .auto_inline import AutoInline
 from .cross_thread_reduction import CrossThreadReduction
 from .multi_level_tiling import MultiLevelTiling, MultiLevelTilingWithIntrin, ReuseType
diff --git a/python/tvm/meta_schedule/schedule_rule/auto_bind.py b/python/tvm/meta_schedule/schedule_rule/auto_bind.py
new file mode 100644
index 0000000000000..c211093e92758
--- /dev/null
+++ b/python/tvm/meta_schedule/schedule_rule/auto_bind.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Auto-bind Rule that binds blocks to threads if needed"""
+from typing import List, Optional
+
+from tvm._ffi import register_object
+
+from .. import _ffi_api
+from .schedule_rule import ScheduleRule
+
+
+@register_object("meta_schedule.AutoBind")
+class AutoBind(ScheduleRule):
+    """Schedule rule that binds the loops around a block to BlockIdx and ThreadIdx
+
+    Parameters
+    ----------
+    max_threadblocks: int
+        The maximum number of threadblocks on GPU. Defaults to 256.
+    thread_extents: Optional[List[int]]
+        Candidates of thread axis extent. Defaults to [32, 64, 128, 256, 512, 1024] if None.
+    """
+
+    def __init__(
+        self,
+        max_threadblocks: int = 256,
+        thread_extents: Optional[List[int]] = None,
+    ) -> None:
+        if thread_extents is None:
+            thread_extents = [32, 64, 128, 256, 512, 1024]
+        self.__init_handle_by_constructor__(
+            _ffi_api.ScheduleRuleAutoBind,  # type: ignore # pylint: disable=no-member
+            max_threadblocks,
+            thread_extents,
+        )
diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
index 261768c4897bf..d6242020726b0 100644
--- a/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
+++ b/python/tvm/meta_schedule/testing/conv2d_winograd_cpu.py
@@ -131,7 +131,7 @@ def conv2d_winograd_cpu(
             vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap(
                 "SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1]
             )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse"})
+            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.llvm"})
             T.reads(
                 [
                     inverse[vh, vw, p_3, co_1],
diff --git a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py b/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
index 530eadafc0f38..e737f9b04e622 100644
--- a/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
+++ b/python/tvm/meta_schedule/testing/conv2d_winograd_cuda.py
@@ -132,7 +132,7 @@ def conv2d_winograd_cuda(  # type: ignore
             vh, vw, p_3, co_1, r_a_1, r_b_1 = T.axis.remap(
                 "SSSSRR", [i0_7, i1_7, i2_5, i3_5, i4_2, i5_1]
             )
-            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse"})
+            T.block_attr({"schedule_rule": "meta_schedule.winograd_inverse.cuda"})
             T.reads(
                 [
                     inverse[vh, vw, p_3, co_1],
diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py
index b149f20c52e3e..e159bfaaaa5ae 100644
--- a/python/tvm/meta_schedule/testing/schedule_rule.py
+++ b/python/tvm/meta_schedule/testing/schedule_rule.py
@@ -17,6 +17,7 @@
 """Default schedule rules"""
 from tvm.meta_schedule.schedule_rule import (
     AddRFactor,
+    AutoBind,
     AutoInline,
     CrossThreadReduction,
     MultiLevelTiling,
@@ -28,6 +29,13 @@
 from tvm.target import Target
 
 
+def auto_bind(target: Target) -> ScheduleRule:
+    """Return the default AutoBind rule for `target`; only the "cuda" target kind is handled"""
+    if target.kind.name != "cuda":
+        raise NotImplementedError(f"{target.kind.name} is not supported")
+    return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024])
+
+
 def auto_inline(target: Target) -> ScheduleRule:
     """Default schedule rules for auto inline"""
     if target.kind.name == "llvm":
diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py
index 270c0dab8db43..9af237b3b7b86 100644
--- a/python/tvm/meta_schedule/tune.py
+++ b/python/tvm/meta_schedule/tune.py
@@ -156,6 +156,10 @@ def _sch_rules() -> List[ScheduleRule]:
                 unroll_max_steps=[0, 16, 64, 512, 1024],
                 unroll_explicit=True,
             ),
+            M.AutoBind(
+                max_threadblocks=256,
+                thread_extents=[32, 64, 128, 256, 512, 1024],
+            ),
         ]
 
     @staticmethod
@@ -177,7 +181,8 @@ def _mutator_probs() -> Dict[Mutator, float]:
 
         return {
             M.MutateTileSize(): 0.9,
-            M.MutateUnroll(): 0.1,
+            M.MutateUnroll(): 0.08,
+            M.MutateThreadBinding(): 0.02,
         }
 
 
@@ -842,6 +847,7 @@ def tune_relay(
     """
     # pylint: disable=import-outside-toplevel
     from tvm.relay import build as relay_build
+
     from .relay_integration import extract_task_from_relay
 
     # pylint: disable=protected-access, enable=import-outside-toplevel
diff --git a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
index 80745a90d9ff0..8accbbe532737 100644
--- a/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_nhwc_winograd.py
@@ -440,7 +440,7 @@ def nhwc_winograd_cuda(
             bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
         ),
         name="inverse",
-        attrs={"schedule_rule": "meta_schedule.winograd_inverse"},
+        attrs={"schedule_rule": "meta_schedule.winograd_inverse.cuda"},
     )
 
     # Output
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py
index 4ff3f52b998f9..d2b373ba87a7d 100644
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -152,7 +152,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
             bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]
         ),
         name="inverse",
-        attrs={"schedule_rule": "meta_schedule.winograd_inverse"},
+        attrs={"schedule_rule": "meta_schedule.winograd_inverse.cuda"},
     )
 
     # output
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index c27ea81144ac2..b7ae9b3e1cd7c 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -1096,6 +1096,11 @@ def _conv2d_winograd_nhwc_impl(
         bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout)
 
     # inverse transform
+    if target is not None:
+        target_kind = "meta_schedule.winograd_inverse." + target.kind.name
+    else:
+        target_kind = "None"
+
     r_a = te.reduce_axis((0, alpha), "r_a")
     r_b = te.reduce_axis((0, alpha), "r_b")
     inverse = te.compute(
@@ -1106,7 +1111,7 @@ def _conv2d_winograd_nhwc_impl(
         name="inverse",
         attrs={
             "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"],
-            "schedule_rule": "meta_schedule.winograd_inverse",
+            "schedule_rule": target_kind,
         },
         # the attrs are necessary hints for the auto-scheduler
     )
diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc
new file mode 100644
index 0000000000000..41207162ee1d4
--- /dev/null
+++ b/src/meta_schedule/mutator/mutate_thread_binding.cc
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+using tir::Instruction;
+using tir::InstructionKind;
+using tir::Trace;
+
+/*! \brief A mutator that mutates the thread binding factor decision of SampleCategorical */
+class MutateThreadBindingNode : public MutatorNode {
+ public:
+  /*! \brief JSON representation of the workload, saved from the tune context's module */
+  std::string json_mod_;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {}  // no attributes are exposed to the visitor
+  static constexpr const char* _type_key = "meta_schedule.MutateThreadBinding";
+  TVM_DECLARE_FINAL_OBJECT_INFO(MutateThreadBindingNode, MutatorNode);
+
+ public:
+  // Inherit from `MutatorNode`: cache the JSON form of the module in the tune context
+  void InitializeWithTuneContext(const TuneContext& context) final {
+    this->json_mod_ = SaveJSON(context->mod.value());
+  }
+  // Inherit from `MutatorNode`: mutate one thread-binding sampling decision of `trace`
+  Optional<Trace> Apply(const Trace& trace, TRandState* rand_state) final;
+
+ private:
+  struct Candidate {
+    /*! \brief The sampling instruction to be mutated */
+    Instruction inst;
+    /*! \brief The probabilities read from the sampling instruction's attributes */
+    std::vector<double> probs;
+    /*! \brief The decision made, used as an index into `probs` */
+    int decision;
+
+    explicit Candidate(Instruction inst, std::vector<double> probs, int decision)
+        : inst(std::move(inst)), probs(std::move(probs)), decision(std::move(decision)) {}
+  };
+
+  std::vector<Candidate> FindCandidates(const Trace& trace, TRandState* rand_state);
+};
+
+/*!
+ * \brief Find Candidate with the following pattern:
+ * \code
+ * v = sch.sample_categorical(...)
+ * l1, l2 = sch.split(loop=l0, factors=[None, v])
+ * sch.bind(loop=l2, thread_axis="threadIdx.x")
+ * \endcode
+ * \param trace The trace from which to find the instructions
+ * \param rand_state The random state; currently not used by this function
+ * \return One candidate per matching bind instruction, holding its sampling instruction
+ */
+std::vector<MutateThreadBindingNode::Candidate> MutateThreadBindingNode::FindCandidates(
+    const Trace& trace, TRandState* rand_state) {
+  using tir::InstructionNode;
+
+  static InstructionKind inst_sample_categorical = InstructionKind::Get("SampleCategorical");
+  static InstructionKind inst_split = InstructionKind::Get("Split");
+  static InstructionKind inst_bind = InstructionKind::Get("Bind");
+
+  std::vector<MutateThreadBindingNode::Candidate> candidates;
+  std::unordered_map<const PrimExprNode*, const tir::InstructionNode*> sample_insts;
+  std::unordered_map<const tir::LoopRVNode*, const tir::InstructionNode*> sampled_split_insts;
+  std::vector<const InstructionNode*> bind_insts;
+
+  auto is_split_by_sample = [&sample_insts](const Instruction& inst) -> bool {
+    if (!inst->kind.same_as(inst_split)) {
+      return false;
+    }
+    // Only consider splits with 2 factors (inputs = loop + 2 factors) where the first is None
+    if (inst->inputs.size() != 3 || inst->inputs[1].defined()) return false;
+    ICHECK(inst->inputs[2].defined());
+
+    return sample_insts.find(Downcast<PrimExpr>(inst->inputs[2]).get()) != sample_insts.end();
+  };
+
+  auto is_thread_binding_by_sample = [&sampled_split_insts](const Instruction& inst) -> bool {
+    if (!inst->kind.same_as(inst_bind)) {
+      return false;
+    }
+    ICHECK_EQ(inst->inputs.size(), 1);
+    ICHECK_EQ(inst->attrs.size(), 1);
+    if (Downcast<String>(inst->attrs[0]) != "threadIdx.x") return false;
+
+    return sampled_split_insts.find(Downcast<tir::LoopRV>(inst->inputs[0]).get()) !=
+           sampled_split_insts.end();
+  };
+
+  for (const Instruction& inst : trace->insts) {
+    if (inst->kind.same_as(inst_sample_categorical)) {
+      ICHECK_EQ(inst->outputs.size(), 1);
+      const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode);
+      sample_insts[var_rv] = inst.get();
+    } else if (is_split_by_sample(inst)) {
+      CHECK_EQ(inst->outputs.size(), 2);
+      // Only consider the inner loop, which can be bound to threadIdx.x
+      const tir::LoopRVNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[1], tir::LoopRVNode);
+      sampled_split_insts[var_rv] = inst.get();
+    } else if (is_thread_binding_by_sample(inst)) {
+      bind_insts.push_back(inst.get());
+    }
+  }
+
+  for (const InstructionNode* bind_inst : bind_insts) {  // walk back: bind -> split -> sample
+    const auto* loop_rv = TVM_TYPE_AS(loop_rv, bind_inst->inputs[0], tir::LoopRVNode);
+    auto split_it = sampled_split_insts.find(loop_rv);
+    ICHECK(split_it != sampled_split_insts.end());
+    const InstructionNode* split_inst = split_it->second;
+
+    const auto* expr_rv = TVM_TYPE_AS(expr_rv, split_inst->inputs[2], PrimExprNode);
+    auto sample_it = sample_insts.find(expr_rv);
+    ICHECK(sample_it != sample_insts.end());
+    const InstructionNode* sample_inst = sample_it->second;
+
+    int decision = Downcast<Integer>(trace->decisions[GetRef<Instruction>(sample_inst)])->value;
+
+    std::vector<double> probs =
+        support::AsVector<FloatImm, double>(Downcast<Array<FloatImm>>(sample_inst->attrs[1]));
+
+    candidates.emplace_back(GetRef<Instruction>(sample_inst), probs, decision);
+  }
+  return candidates;
+}
+
+// Mutate the decision of one randomly chosen thread-binding SampleCategorical in `trace`.
+// Returns NullOpt when no candidate exists or the chosen one offers no alternative decision.
+Optional<Trace> MutateThreadBindingNode::Apply(const Trace& trace, TRandState* rand_state) {
+  std::vector<Candidate> candidates = FindCandidates(trace, rand_state);
+  if (candidates.empty()) return NullOpt;
+  Candidate candidate = candidates[tir::SampleInt(rand_state, 0, candidates.size())];
+  // Erasing the only probability would leave the sampler with no weights: nothing to mutate
+  if (candidate.probs.size() <= 1) return NullOpt;
+  // Remove the current decision, sample among the rest, then map back to the original index
+  candidate.probs.erase(candidate.probs.begin() + candidate.decision);
+  int result = tir::MakeMultinomialSampler(rand_state, candidate.probs)();
+  if (result >= candidate.decision) ++result;
+  return trace->WithDecision(candidate.inst, Integer(result), /*remove_postproc=*/true);
+}
+
+Mutator Mutator::MutateThreadBinding() { return Mutator(make_object<MutateThreadBindingNode>()); }
+
+TVM_REGISTER_NODE_TYPE(MutateThreadBindingNode);
+TVM_REGISTER_GLOBAL("meta_schedule.MutateThreadBinding")
+    .set_body_typed(Mutator::MutateThreadBinding);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc
index 73dc89d30e1fb..183f04e7ba239 100644
--- a/src/meta_schedule/postproc/rewrite_unbound_block.cc
+++ b/src/meta_schedule/postproc/rewrite_unbound_block.cc
@@ -16,84 +16,12 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include "../schedule_rule/auto_bind.h"
 #include "../utils.h"
 
 namespace tvm {
 namespace tir {
 
-/*! \brief The rewrite type for an unbound block */
-enum class BindType : int32_t {
-  /*! \brief No additional thread binding is needed */
-  kNoBind = 0,
-  /*! \brief Need to bind to blockIdx */
-  kBindBlock = 1,
-  /*! \brief Need to bind to both blockIdx and threadIdx */
-  kBindBlockThread = 2,
-};
-
-/*!
- * \brief Check the combination of bindings to be added to the block
- * \param block_sref The block to be checked
- * \param fuse_first_num The number of loops to be fused
- * \return The type of binding to be added to the block
- */
-BindType GetBindType(const StmtSRef& block_sref, int* fuse_first_num) {
-  Array<StmtSRef> loops = tir::GetLoops(block_sref);
-  int n = loops.size();
-  if (n == 0) {
-    return BindType::kNoBind;
-  }
-  int i_block_idx = -1;
-  int i_thread_idx = -1;
-  int i_multi_child = -1;
-  int i_spatial_loop = -1;
-  for (int i = 0; i < n; ++i) {
-    const StmtSRef& loop_sref = loops[i];
-    const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
-    runtime::ThreadScope thread_scope = GetThreadScope(loop);
-    if (IsBlockIdx(thread_scope)) {
-      if (i_block_idx == -1) {
-        i_block_idx = i;
-      }
-    }
-    if (IsThreadIdx(thread_scope)) {
-      if (i_thread_idx == -1) {
-        i_thread_idx = i;
-      }
-    }
-    if (loop->kind != tir::ForKind::kSerial) {
-      if (i_multi_child == -1) {
-        i_multi_child = i;
-      }
-    }
-    if (!IsSingleStmt(loop->body)) {
-      if (i_multi_child == -1) {
-        i_multi_child = i + 1;
-      }
-    }
-    if (tir::GetLoopIterType(loop_sref) == IterVarType::kDataPar) {
-      if (i_spatial_loop == i - 1) {
-        ++i_spatial_loop;
-      }
-    }
-  }
-  if (i_multi_child == -1) {
-    i_multi_child = n;
-  }
-  if ((i_block_idx != -1 && i_thread_idx != -1) || i_spatial_loop == -1) {
-    return BindType::kNoBind;
-  } else if (i_block_idx != -1 && i_thread_idx == -1) {
-    ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not";
-    throw;
-  } else if (i_block_idx == -1 && i_thread_idx != -1) {
-    *fuse_first_num = std::min(std::min(i_multi_child, i_thread_idx), i_spatial_loop + 1);
-    return BindType::kBindBlock;
-  } else {  // i_block_idx == -1 && i_thread_idx == -1
-    *fuse_first_num = std::min(i_multi_child, i_spatial_loop + 1);
-    return BindType::kBindBlockThread;
-  }
-}
-
 /*! \brief Find all the blocks that are not bound */
 class UnboundBlockFinder : private StmtVisitor {
  public:
@@ -159,11 +87,11 @@ class RewriteUnboundBlockNode : public PostprocNode {
   // Inherited from PostprocNode
   void InitializeWithTuneContext(const TuneContext& context) final {
     CHECK(context->target.defined()) << "ValueError: target is not defined";
-    Optional<Integer> max_num_threads =
+    Optional<Integer> max_threads_per_block =
         context->target.value()->GetAttr<Integer>("max_threads_per_block");
-    CHECK(max_num_threads.defined())
+    CHECK(max_threads_per_block.defined())
         << "ValueError: missing attribute `max_threads_per_block` in the target";
-    this->max_num_threads_ = max_num_threads.value();
+    this->max_threads_per_block_ = max_threads_per_block.value();
   }
 
   // Inherited from PostprocNode
@@ -171,13 +99,13 @@ class RewriteUnboundBlockNode : public PostprocNode {
 
  public:
   /*! \brief The max number of threads per block from Target */
-  int max_num_threads_ = -1;
+  int max_threads_per_block_ = -1;
   /*! \brief The max number of threadblocks in the cuda device */
-  int max_threadblock_ = -1;
+  int max_threadblocks_ = -1;
 
   void VisitAttrs(tvm::AttrVisitor* v) {
-    // `max_num_threads_` is not visited
-    // `max_threadblock_` is not visited
+    // `max_threads_per_block_` is not visited
+    // `max_threadblocks_` is not visited
   }
 
   static constexpr const char* _type_key = "meta_schedule.RewriteUnboundBlock";
@@ -186,61 +114,28 @@ class RewriteUnboundBlockNode : public PostprocNode {
 
 bool RewriteUnboundBlockNode::Apply(const tir::Schedule& sch) {
   using tir::BlockRV;
+  using tir::ExprRV;
   using tir::LoopRV;
   using tir::Schedule;
-  ICHECK_NE(this->max_num_threads_, -1);
+  ICHECK_NE(this->max_threads_per_block_, -1);
+  auto get_factor = [t = this->max_threads_per_block_](int max_extent) -> ExprRV {
+    return Integer(std::min(t, max_extent));
+  };
   std::vector<std::pair<tir::StmtSRef, String>> unbound_blocks =
       tir::UnboundBlockFinder::Find(sch->state());
   for (const auto& kv : unbound_blocks) {
     tir::StmtSRef block_sref = kv.first;
     String global_var_name = kv.second;
-    int fuse_first_num = 0;
-    tir::BindType bind_type = tir::GetBindType(block_sref, &fuse_first_num);
-    if (bind_type == tir::BindType::kNoBind) {
-      continue;
-    }
     BlockRV block_rv = GetRVFromSRef(sch, block_sref, global_var_name);
-    Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
-    LoopRV fused = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + fuse_first_num});
-    if (bind_type == tir::BindType::kBindBlock) {
-      sch->Bind(fused, "blockIdx.x");
-    } else if (bind_type == tir::BindType::kBindBlockThread) {
-      int64_t extent_size = 0;
-      Array<LoopRV> splits;
-      if (const int64_t* extent_ptr = tir::GetLoopIntExtent(sch->Get(fused).get())) {
-        extent_size = *extent_ptr;
-        if (extent_size > max_threadblock_ * max_num_threads_) {
-          splits =
-              sch->Split(fused, {NullOpt, Integer(max_threadblock_), Integer(max_num_threads_)});
-          ICHECK_EQ(splits.size(), 3);
-          sch->Reorder({splits[1], splits[2], splits[0]});
-          sch->Bind(splits[1], "blockIdx.x");
-          sch->Bind(splits[2], "threadIdx.x");
-        } else {
-          ICHECK_NE(extent_size, 0);
-          splits = sch->Split(
-              fused,
-              {NullOpt, Integer(std::min(static_cast<int64_t>(max_num_threads_), extent_size))});
-          ICHECK_EQ(splits.size(), 2);
-          sch->Bind(splits[0], "blockIdx.x");
-          sch->Bind(splits[1], "threadIdx.x");
-        }
-      } else {
-        // loop is dynamic, returns nullptr
-        splits = sch->Split(fused, {NullOpt, Integer(max_num_threads_)});
-        ICHECK_EQ(splits.size(), 2);
-        sch->Bind(splits[0], "blockIdx.x");
-        sch->Bind(splits[1], "threadIdx.x");
-      }
-    }
+    BindBlockThreadIdx(sch, block_rv, max_threadblocks_, max_threads_per_block_, get_factor);
   }
   return true;
 }
 
-Postproc Postproc::RewriteUnboundBlock(int max_threadblock) {
+Postproc Postproc::RewriteUnboundBlock(int max_threadblocks) {
   ObjectPtr<RewriteUnboundBlockNode> n = make_object<RewriteUnboundBlockNode>();
-  n->max_threadblock_ = max_threadblock;
-  n->max_num_threads_ = -1;
+  n->max_threadblocks_ = max_threadblocks;
+  n->max_threads_per_block_ = -1;
   return Postproc(n);
 }
 
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc
new file mode 100644
index 0000000000000..9c16856557e00
--- /dev/null
+++ b/src/meta_schedule/schedule_rule/auto_bind.cc
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "./auto_bind.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block_rv,
+                        int64_t max_threadblocks, int64_t max_threads_per_block,
+                        std::function<tir::ExprRV(int64_t)> get_factor) {
+  // Adds whichever of the blockIdx.x / threadIdx.x bindings the loops of `block_rv` still lack.
+  // Does nothing when both are present or when the outermost loop is not data-parallel.
+  // `get_factor` maps the largest admissible thread extent to the threadIdx.x split factor.
+  using namespace tvm::tir;
+  Array<StmtSRef> loops = tir::GetLoops(sch->GetSRef(block_rv));
+  int n = loops.size();
+  if (n == 0) {
+    return;
+  }
+  int i_block_idx = -1;     // first loop bound to blockIdx, -1 if none
+  int i_thread_idx = -1;    // first loop bound to threadIdx, -1 if none
+  int i_multi_child = -1;   // first loop that must stay out of the fused range
+  int i_spatial_loop = -1;  // last loop of the leading run of data-parallel loops
+  for (int i = 0; i < n; ++i) {
+    const StmtSRef& loop_sref = loops[i];
+    const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref);
+    runtime::ThreadScope thread_scope = GetThreadScope(loop);
+    if (IsBlockIdx(thread_scope) && i_block_idx == -1) {
+      i_block_idx = i;
+    }
+    if (IsThreadIdx(thread_scope) && i_thread_idx == -1) {
+      i_thread_idx = i;
+    }
+    // A non-serial loop is itself excluded from the fused range ...
+    if (loop->kind != ForKind::kSerial && i_multi_child == -1) {
+      i_multi_child = i;
+    }
+    // ... whereas a loop with a multi-statement body is the last one that may be included.
+    if (!IsSingleStmt(loop->body) && i_multi_child == -1) {
+      i_multi_child = i + 1;
+    }
+    // Extends only while the data-parallel run is unbroken from the outermost loop.
+    if (GetLoopIterType(loop_sref) == IterVarType::kDataPar && i_spatial_loop == i - 1) {
+      ++i_spatial_loop;
+    }
+  }
+  if (i_multi_child == -1) {
+    i_multi_child = n;
+  }
+  if ((i_block_idx != -1 && i_thread_idx != -1) || i_spatial_loop == -1) {
+    return;
+  }
+  if (i_block_idx != -1 && i_thread_idx == -1) {
+    ICHECK(false) << "Unsupported case, where blockIdx is bound but threadIdx is not";
+    throw;
+  }
+  int num_fuse = std::min(i_multi_child, i_spatial_loop + 1);
+  if (i_thread_idx != -1) {
+    num_fuse = std::min(num_fuse, i_thread_idx);
+  }
+  // `Fuse` is handed an empty range when the outermost loop is non-serial (for instance
+  // already bound to threadIdx); leave such a block untouched instead of fusing nothing.
+  if (num_fuse == 0) {
+    return;
+  }
+  Array<LoopRV> loop_rvs = sch->GetLoops(block_rv);
+  LoopRV loop_rv = sch->Fuse({loop_rvs.begin(), loop_rvs.begin() + num_fuse});
+  if (i_thread_idx != -1) {
+    // threadIdx is already bound further in, so only blockIdx.x is missing.
+    sch->Bind(loop_rv, "blockIdx.x");
+    return;
+  }
+  // Neither axis is bound yet. A non-constant extent is treated as unbounded.
+  int64_t extent = std::numeric_limits<int64_t>::max();
+  if (const int64_t* e = GetLoopIntExtent(sch->Get(loop_rv).get())) {
+    extent = *e;
+  }
+  if (extent <= max_threadblocks * max_threads_per_block) {
+    ExprRV factor = get_factor(std::min(extent, max_threads_per_block));
+    Array<LoopRV> splits = sch->Split(loop_rv, {NullOpt, factor});
+    ICHECK_EQ(splits.size(), 2);
+    sch->Bind(splits[0], "blockIdx.x");
+    sch->Bind(splits[1], "threadIdx.x");
+  } else {
+    // Keep the excess iterations as an unbound loop nested inside the two bound loops.
+    Array<LoopRV> splits = sch->Split(loop_rv, {NullOpt,
+                                                Integer(max_threadblocks),  //
+                                                Integer(max_threads_per_block)});
+    ICHECK_EQ(splits.size(), 3);
+    sch->Reorder({splits[1], splits[2], splits[0]});
+    sch->Bind(splits[1], "blockIdx.x");
+    sch->Bind(splits[2], "threadIdx.x");
+  }
+}
+
+std::function<tir::ExprRV(int64_t)> MakeFactorSampler(tir::Schedule sch,
+                                                      Array<Integer> thread_extents) {
+  return [sch = std::move(sch),
+          thread_extents = std::move(thread_extents)](int64_t max_extent) -> tir::ExprRV {
+    // Keep only the candidates that do not exceed the requested bound.
+    Array<Integer> feasible;
+    feasible.reserve(thread_extents.size());
+    for (const Integer& candidate : thread_extents) {
+      if (candidate->value <= max_extent) {
+        feasible.push_back(candidate);
+      }
+    }
+    const int num_feasible = feasible.size();
+    // Nothing fits: fall back to the bound itself. One fits: there is nothing to sample.
+    if (num_feasible <= 1) {
+      return num_feasible == 0 ? Integer(max_extent) : Integer(feasible[0]);
+    }
+    // Otherwise draw uniformly among the feasible candidates.
+    Array<FloatImm> probs(num_feasible, FloatImm(DataType::Float(64), 1.0 / num_feasible));
+    return sch->SampleCategorical(feasible, probs);
+  };
+}
+
+class AutoBindNode : public ScheduleRuleNode {
+ public:
+  // Inherited from ScheduleRuleNode: reads `max_threads_per_block` from the context's target
+  void InitializeWithTuneContext(const TuneContext& context) final {
+    CHECK(context->target.defined()) << "ValueError: target is not defined";
+    Optional<Integer> max_threads_per_block =
+        context->target.value()->GetAttr<Integer>("max_threads_per_block");
+    CHECK(max_threads_per_block.defined())
+        << "ValueError: missing attribute `max_threads_per_block` in the target";
+    this->max_threads_per_block_ = max_threads_per_block.value();
+  }
+
+  // Inherited from ScheduleRuleNode: defined out of line, after the class body
+  Array<tir::Schedule> Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final;
+
+ public:
+  /*! \brief The max number of threads per block from Target; stays -1 until initialized */
+  int64_t max_threads_per_block_ = -1;
+  /*! \brief The max number of threadblocks, as passed to `ScheduleRule::AutoBind` */
+  int64_t max_threadblocks_ = -1;
+  /*! \brief Candidate extents of the thread axis, from which the split factor is sampled */
+  Array<Integer> thread_extents_;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    // `max_threads_per_block_` is not visited
+    // `max_threadblocks_` is not visited
+    // `thread_extents_` is not visited
+  }
+
+  static constexpr const char* _type_key = "meta_schedule.AutoBind";
+  TVM_DECLARE_FINAL_OBJECT_INFO(AutoBindNode, ScheduleRuleNode);
+};
+
+Array<tir::Schedule> AutoBindNode::Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) {
+  ICHECK_NE(this->max_threads_per_block_, -1);  // still -1 means the rule was never initialized
+  BindBlockThreadIdx(sch, block_rv, this->max_threadblocks_, this->max_threads_per_block_,
+                     MakeFactorSampler(sch, this->thread_extents_));  // binds `sch` in place
+  return {sch};
+}
+
+ScheduleRule ScheduleRule::AutoBind(int max_threadblocks, Array<Integer> thread_extents) {
+  ObjectPtr<AutoBindNode> node = make_object<AutoBindNode>();
+  node->thread_extents_ = std::move(thread_extents);
+  node->max_threadblocks_ = max_threadblocks;
+  node->max_threads_per_block_ = -1;  // placeholder until InitializeWithTuneContext runs
+  return ScheduleRule(node);
+}
+
+TVM_REGISTER_NODE_TYPE(AutoBindNode);
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleAutoBind").set_body_typed(ScheduleRule::AutoBind);
+
+}  // namespace meta_schedule
+}  // namespace tvm
diff --git a/src/meta_schedule/schedule_rule/auto_bind.h b/src/meta_schedule/schedule_rule/auto_bind.h
new file mode 100644
index 0000000000000..b397d2015c19a
--- /dev/null
+++ b/src/meta_schedule/schedule_rule/auto_bind.h
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
+#define TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
+
+#include "../utils.h"
+
+namespace tvm {
+namespace meta_schedule {
+
+/*!
+ * \brief Bind the loops of the given block to blockIdx.x/threadIdx.x where a binding is missing.
+ * \param sch The schedule, which is modified in place.
+ * \param block The block to be bound.
+ * \param max_threadblocks The maximum number of threadblocks allowed.
+ * \param max_threads_per_block The maximum number of threads allowed in one threadblock.
+ * \param get_factor Given the largest usable thread extent, returns the threadIdx.x split factor.
+ */
+void BindBlockThreadIdx(const tir::Schedule& sch, const tir::BlockRV& block,
+                        int64_t max_threadblocks, int64_t max_threads_per_block,
+                        std::function<tir::ExprRV(int64_t max_extent)> get_factor);
+
+/*!
+ * \brief Given candidates of thread_extents, make a sampler that uses `sch->SampleCategorical`
+ * to return a random thread extent.
+ * \param sch The schedule on which the sampling instruction is recorded.
+ * \param thread_extents The candidate thread extents.
+ * \return A sampler that takes an upper bound and returns a thread extent not exceeding it.
+ */
+std::function<tir::ExprRV(int64_t max_extent)> MakeFactorSampler(tir::Schedule sch,
+                                                                 Array<Integer> thread_extents);
+
+}  // namespace meta_schedule
+}  // namespace tvm
+
+#endif  // TVM_META_SCHEDULE_SCHEDULE_RULE_AUTO_BIND_H_
diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc
index d8aab3a3f757a..ceec080b00a9f 100644
--- a/src/meta_schedule/schedule_rule/winograd.cc
+++ b/src/meta_schedule/schedule_rule/winograd.cc
@@ -17,9 +17,12 @@
  * under the License.
  */
 #include "../utils.h"
+#include "./auto_bind.h"
 
 namespace tvm {
-namespace tir {
+namespace meta_schedule {
+
+using namespace tvm::tir;
 
 TVM_REGISTER_GLOBAL("meta_schedule.compute_inline")
     .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
@@ -63,7 +66,7 @@ inline LoopRV ScheduleDataPack(Schedule sch, BlockRV block) {
   return t1[1];
 }
 
-TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse")
+TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.llvm")
     .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
       ScheduleDataPack(sch, block);
       return {sch};
@@ -81,6 +84,16 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.llvm")
       return {sch};
     });
 
+TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.cuda")
+    .set_body_typed([](Schedule sch, BlockRV block) -> Array<Schedule> {
+      ScheduleDataPack(sch, block);
+      int64_t max_threadblocks = 256;        // hard-coded here, not read from the target
+      int64_t max_threads_per_block = 1024;  // hard-coded here, not read from the target
+      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
+      BindBlockThreadIdx(sch, block, max_threadblocks, max_threads_per_block, get_factor);
+      return {sch};
+    });
+
 TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda")
     .set_body_typed([](Schedule sch, BlockRV data_pack) -> Array<Schedule> {
       BlockRV input_tile = GetOnlyProducer(sch, data_pack);
@@ -89,8 +102,12 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda")
       sch->ComputeAt(input_tile, /*loop_rv=*/loop, /*preserve_unit_loops=*/true);
       sch->SetScope(input_tile, /*buffer_index=*/0, /*storage_scope=*/"local");
       sch->ComputeInline(data_pad);
+      int64_t max_threadblocks = 256;
+      int64_t max_threads_per_block = 1024;
+      auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024});
+      BindBlockThreadIdx(sch, data_pack, max_threadblocks, max_threads_per_block, get_factor);
       return {sch};
     });
 
-}  // namespace tir
+}  // namespace meta_schedule
 }  // namespace tvm
diff --git a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
index afe6548d6fe39..328f98e7f0cb0 100644
--- a/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
+++ b/tests/python/unittest/test_meta_schedule_custom_rule_winograd_cuda.py
@@ -44,6 +44,25 @@ def input_tile_data_pad(sch: Schedule):
         b127 = sch.get_block(name="data_pad")
         sch.compute_inline(block=b127)
 
+        b3 = sch.get_block(name="data_pack")
+        l25, l26, l27, l28, _, _, _, _ = sch.get_loops(block=b3)
+        l33 = sch.fuse(l25, l26, l27, l28)
+        v34 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        l35, l36 = sch.split(loop=l33, factors=[None, v34])
+        sch.bind(loop=l35, thread_axis="blockIdx.x")
+        sch.bind(loop=l36, thread_axis="threadIdx.x")
+
     def data_pack(sch: Schedule):
         b16 = sch.get_block(name="data_pack")
         l17, l18, l19, l20, l21, l22 = sch.get_loops(block=b16)
@@ -74,6 +93,16 @@ def bgemm(sch: Schedule):
             ann_key="meta_schedule.tiling_structure",
             ann_val="SSSRRSRS",
         )
+        sch.annotate(
+            block_or_loop=b31,
+            ann_key="meta_schedule.thread_extent_low_inclusive",
+            ann_val=32,
+        )
+        sch.annotate(
+            block_or_loop=b31,
+            ann_key="meta_schedule.thread_extent_high_inclusive",
+            ann_val=1024,
+        )
         b32 = sch.cache_write(block=b31, write_buffer_index=0, storage_scope="local")
         b31, b32 = b32, b31
         l33, l34, l35, l36, l37 = sch.get_loops(block=b32)
@@ -185,6 +214,57 @@ def inverse(sch: Schedule):
         sch.unroll(loop=l6)
         sch.unroll(loop=l7)
         sch.reorder(l10, l14, l11, l15, l2, l3, l6, l7)
+        l59 = sch.fuse(l10, l14, l11, l15)
+        v60 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        l61, l62 = sch.split(loop=l59, factors=[None, v60])
+        sch.bind(loop=l61, thread_axis="blockIdx.x")
+        sch.bind(loop=l62, thread_axis="threadIdx.x")
+
+    def conv2d(sch: Schedule):
+        b7 = sch.get_block(name="conv2d_winograd")
+        l141, l142, l143, l144 = sch.get_loops(block=b7)
+        l145 = sch.fuse(l141, l142, l143, l144)
+        v146 = sch.sample_categorical(
+            candidates=[32, 64, 128, 256, 512, 1024],
+            probs=[
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+                0.16666666666666666,
+            ],
+            decision=2,
+        )
+        l147, l148 = sch.split(loop=l145, factors=[None, v146])
+        sch.bind(loop=l147, thread_axis="blockIdx.x")
+        sch.bind(loop=l148, thread_axis="threadIdx.x")
+
+    def root_anno(sch: Schedule):
+        b8 = sch.get_block(name="root", func_name="main")
+        v140 = sch.sample_categorical(
+            candidates=[0, 16, 64, 512, 1024],
+            probs=[
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+                0.20000000000000001,
+            ],
+            decision=2,
+        )
+        sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v140)
 
     # pylint: enable=invalid-name
 
@@ -194,6 +274,8 @@ def inverse(sch: Schedule):
     input_tile_data_pad(sch)
     bgemm(sch)
     inverse(sch)
+    conv2d(sch)
+    root_anno(sch)
 
     return sch.mod
 
@@ -203,23 +285,27 @@ def test_conv2d_winograd_cuda():
     mod = IRModule({"main": mod})
     context = TuneContext(
         mod=mod,
-        target=Target("cuda"),
+        target=Target("nvidia/geforce-rtx-3090", host="llvm"),
         task_name="Custom Search Space Task",
         sch_rules=DefaultCUDA._sch_rules(),  # pylint: disable=protected-access
     )
+    for sch_rule in context.sch_rules:
+        sch_rule.initialize_with_tune_context(context)
     post_order_apply = PostOrderApply()
     post_order_apply.initialize_with_tune_context(context)
     (sch,) = post_order_apply.generate_design_space(mod)
     decisions = dict(
         zip(
-            [i for i in sch.trace.insts[:-2] if i.kind.name.startswith("Sample")],
+            [i for i in sch.trace.insts if i.kind.name.startswith("Sample")],
             [
                 # data_pack
                 [3, 3],
                 [64, 2],
+                2,
                 # inverse
                 [3, 3],
                 [2, 64],
+                2,
                 # bgemm
                 [1, 1, 1, 1, 6],
                 [1, 1, 1, 3, 2],
@@ -228,10 +314,14 @@ def test_conv2d_winograd_cuda():
                 [32, 1, 4],
                 1,
                 1,
+                # root anno
+                2,
+                # conv2d
+                2,
             ],
         )
     )
-    trace = Trace(sch.trace.insts[:-2], decisions=decisions)
+    trace = Trace(sch.trace.insts, decisions=decisions)
     sch = Schedule(mod=mod)
     trace.apply_to_schedule(sch, remove_postproc=False)
     answer = sch.mod
diff --git a/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
new file mode 100644
index 0000000000000..a2e5dcbd1f0a8
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_mutator_mutate_thread_binding.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+from tvm.meta_schedule import TuneContext
+from tvm.meta_schedule.mutator import MutateThreadBinding, Mutator
+from tvm.script import tir as T
+from tvm.target import Target
+from tvm.tir import Schedule
+
+# pylint: disable=invalid-name, no-member
+
+
+@T.prim_func
+def element_wise(var_A: T.handle, var_B: T.handle) -> None:
+    A = T.match_buffer(var_A, [512, 512], dtype="float32")
+    B = T.match_buffer(var_B, [512, 512], dtype="float32")
+    for i, j in T.grid(512, 512):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            B[vi, vj] = A[vi, vj] + 1.0
+
+
+# pylint: enable=invalid-name, no-member
+
+
+def _sch() -> Schedule:
+    sch = Schedule(element_wise, debug_mask="all")
+    # pylint: disable=invalid-name
+    b0 = sch.get_block(name="C", func_name="main")
+    l1, l2 = sch.get_loops(block=b0)
+    l3 = sch.fuse(l1, l2)
+    v4 = sch.sample_categorical(
+        candidates=[32, 64, 128, 256, 512, 1024],
+        probs=[
+            0.16666666666666666,
+            0.16666666666666666,
+            0.16666666666666666,
+            0.16666666666666666,
+            0.16666666666666666,
+            0.16666666666666666,
+        ],
+        decision=3,
+    )
+    l5, l6 = sch.split(loop=l3, factors=[None, v4])
+    sch.bind(loop=l5, thread_axis="blockIdx.x")
+    sch.bind(loop=l6, thread_axis="threadIdx.x")
+    # pylint: enable=invalid-name
+    return sch
+
+
+def _make_mutator(target: Target) -> Mutator:
+    mutator = MutateThreadBinding()
+    mutator.initialize_with_tune_context(TuneContext(mod=element_wise, target=target))
+    return mutator
+
+
+def test_mutate_thread_binding():
+    mutator = _make_mutator(target=Target("cuda"))
+    sch = _sch()
+    results = set()
+    for _ in range(100):
+        trace = mutator.apply(sch.trace)
+        decision = trace.decisions[trace.insts[-4]]
+        results.add(decision)
+        if len(results) == 5:
+            break
+    assert len(results) == 5
+    assert results == {0, 1, 2, 4, 5}
+
+
+if __name__ == "__main__":
+    test_mutate_thread_binding()
diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
new file mode 100644
index 0000000000000..bd0a24e8b642e
--- /dev/null
+++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+
+from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply
+from tvm.meta_schedule.testing.schedule_rule import auto_bind
+from tvm.meta_schedule.testing.space_generation import check_trace
+from tvm.meta_schedule.tune_context import TuneContext
+from tvm.target import Target
+from tvm.script import tir as T
+
+
+@T.prim_func
+def element_wise(var_A: T.handle, var_B: T.handle) -> None:
+    A = T.match_buffer(var_A, [512, 512], dtype="float32")
+    B = T.match_buffer(var_B, [512, 512], dtype="float32")
+    for i, j in T.grid(512, 512):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            B[vi, vj] = A[vi, vj] + 1.0
+
+
+def _create_context(mod, target, rule) -> TuneContext:
+    ctx = TuneContext(
+        mod=mod,
+        target=target,
+        space_generator=PostOrderApply(),
+        sch_rules=[rule],
+        task_name="test",
+    )
+    ctx.space_generator.initialize_with_tune_context(ctx)
+    for sch_rule in ctx.sch_rules:
+        sch_rule.initialize_with_tune_context(ctx)
+    return ctx
+
+
+def test_cuda_element_wise():
+    expected = [
+        [
+            'b0 = sch.get_block(name="C", func_name="main")',
+            "l1, l2 = sch.get_loops(block=b0)",
+            "l3 = sch.fuse(l1, l2)",
+            "v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])",
+            "l5, l6 = sch.split(loop=l3, factors=[None, v4])",
+            'sch.bind(loop=l5, thread_axis="blockIdx.x")',
+            'sch.bind(loop=l6, thread_axis="threadIdx.x")',
+        ]
+    ]
+    target = Target("nvidia/geforce-rtx-3080", host="llvm")
+    ctx = _create_context(
+        element_wise,
+        target=target,
+        rule=auto_bind(target=target),
+    )
+    spaces = ctx.space_generator.generate_design_space(mod=ctx.mod)
+    assert len(spaces) == 1
+    check_trace(spaces, expected)
+
+
+if __name__ == "__main__":
+    test_cuda_element_wise()

From d0999bbd3b40b9466cc3b5c01f2b4b7fb09b478d Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <Lunderberg@users.noreply.github.com>
Date: Fri, 20 May 2022 21:33:55 -0500
Subject: [PATCH 54/59] [FFI] Renamed __VisitAttrs__ and __fvisit__ to
 non-reserved names (#11392)

All names beginning with two underscores are reserved for the
compiler, even if they occur inside a class or namespace.
---
 include/tvm/ir/attrs.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index 9a24687149629..d2eda659a5d1e 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -67,13 +67,13 @@ namespace tvm {
   static constexpr const char* _type_key = TypeKey;              \
   TVM_DECLARE_FINAL_OBJECT_INFO(ClassName, ::tvm::BaseAttrsNode) \
   template <typename FVisit>                                     \
-  void __VisitAttrs__(FVisit& __fvisit__)  // NOLINT(*)
+  void _tvm_VisitAttrs(FVisit& _tvm_fvisit)  // NOLINT(*)
 
 /*!
  * \brief Declare an attribute field.
  * \param FieldName The field name.
  */
-#define TVM_ATTR_FIELD(FieldName) __fvisit__(#FieldName, &FieldName)
+#define TVM_ATTR_FIELD(FieldName) _tvm_fvisit(#FieldName, &FieldName)
 
 /*!
  * \brief Create a NodeRef type that represents null.
@@ -835,12 +835,12 @@ class AttrsNode : public BaseAttrsNode {
  public:
   void VisitAttrs(AttrVisitor* v) {
     ::tvm::detail::AttrNormalVisitor vis(v);
-    self()->__VisitAttrs__(vis);
+    self()->_tvm_VisitAttrs(vis);
   }
 
   void VisitNonDefaultAttrs(AttrVisitor* v) {
     ::tvm::detail::AttrNonDefaultVisitor vis(v);
-    self()->__VisitAttrs__(vis);
+    self()->_tvm_VisitAttrs(vis);
   }
 
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
@@ -861,7 +861,7 @@ class AttrsNode : public BaseAttrsNode {
         return false;
       };
       auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
-      self()->__VisitAttrs__(vis);
+      self()->_tvm_VisitAttrs(vis);
       hit_count = vis.hit_count_;
     } else {
       // construct a map then do lookup.
@@ -879,7 +879,7 @@ class AttrsNode : public BaseAttrsNode {
         return false;
       };
       auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
-      self()->__VisitAttrs__(vis);
+      self()->_tvm_VisitAttrs(vis);
       hit_count = vis.hit_count_;
     }
     // error handling, slow path
@@ -887,7 +887,7 @@ class AttrsNode : public BaseAttrsNode {
       for (int i = 0; i < args.size(); i += 2) {
         ::tvm::detail::AttrExistVisitor visitor;
         visitor.key_ = args[i].operator std::string();
-        self()->__VisitAttrs__(visitor);
+        self()->_tvm_VisitAttrs(visitor);
         if (!visitor.exist_) {
           std::ostringstream os;
           os << DerivedType::_type_key << ": does not have field \'" << visitor.key_
@@ -903,18 +903,18 @@ class AttrsNode : public BaseAttrsNode {
   bool SEqualReduce(const DerivedType* other, SEqualReducer equal) const {
     DerivedType* pself = self();
     ::tvm::detail::AttrsSEqualVisitor visitor(pself, other, equal);
-    self()->__VisitAttrs__(visitor);
+    self()->_tvm_VisitAttrs(visitor);
     return visitor.result_;
   }
 
   void SHashReduce(SHashReducer hash_reducer) const {
     ::tvm::detail::AttrsSHashVisitor visitor(hash_reducer);
-    self()->__VisitAttrs__(visitor);
+    self()->_tvm_VisitAttrs(visitor);
   }
 
   Array<AttrFieldInfo> ListFieldInfo() const final {
     ::tvm::detail::AttrDocVisitor visitor;
-    self()->__VisitAttrs__(visitor);
+    self()->_tvm_VisitAttrs(visitor);
     return visitor.fields_;
   }
 

From fa5460242e31cea3df7db8efe42da57196eba25e Mon Sep 17 00:00:00 2001
From: Junru Shao <junrushao1994@gmail.com>
Date: Sat, 21 May 2022 07:21:15 -0700
Subject: [PATCH 55/59] [MetaSchedule] Enhance CPU auto vectorization (#11404)

---
 .../rewrite_parallel_vectorize_unroll.cc      |  2 +-
 ...tproc_rewrite_parallel_vectorize_unroll.py | 91 ++++++++++++++++++-
 2 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
index 69e8dfb858bce..001c97645b6e9 100644
--- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
+++ b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc
@@ -207,7 +207,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv,
         continue;
       } else if (prev_used_iter == -1) {
         // the stride of last axis is not 1 means the memory access is not contiguous
-        if (strides[i] != 1) {
+        if (strides[i] != 1 && fusible != 0) {
           break;
         }
         fusible++;
diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py
index 9988e874b81da..f9b71bfdb654a 100644
--- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py
+++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_parallel_vectorize_unroll.py
@@ -16,9 +16,8 @@
 # under the License.
 # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
 import tvm
-from tvm.script import tir as T
-
 from tvm.meta_schedule.postproc import RewriteParallelVectorizeUnroll
+from tvm.script import tir as T
 from tvm.tir.schedule import Schedule
 
 # pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant
@@ -70,6 +69,85 @@ def Move_PUV0(a: T.handle, b: T.handle) -> None:
                         T.writes([B[vi, vj, vk]])
                         B[vi, vj, vk] = A[vi, vj, vk]
 
+
+@tvm.script.ir_module
+class Fused_NN_Dense:
+    @T.prim_func
+    def main(placeholder: T.Buffer[(64, 768), "float32"], placeholder_1: T.Buffer[(768, 768), "float32"], T_matmul_NT: T.Buffer[(64, 768), "float32"]) -> None:
+        # function attr dict
+        T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]})
+        # body
+        # with T.block("root")
+        for i0, i1, i2 in T.grid(64, 768, 768):
+            with T.block("T_matmul_NT"):
+                i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+                T.reads(placeholder[i, k], placeholder_1[j, k])
+                T.writes(T_matmul_NT[i, j])
+                with T.init():
+                    T_matmul_NT[i, j] = T.float32(0)
+                T_matmul_NT[i, j] = T_matmul_NT[i, j] + placeholder[i, k] * placeholder_1[j, k]
+
+@T.prim_func
+def before_matmul_vectorize(
+    placeholder: T.Buffer[(64, 768), "float32"],
+    placeholder_1: T.Buffer[(768, 768), "float32"],
+    T_matmul_NT: T.Buffer[(64, 768), "float32"],
+) -> None:
+    T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]})
+    with T.block("root"):
+        T.reads()
+        T.writes()
+        T.block_attr({"meta_schedule.vectorize":64})
+        T_matmul_NT_global = T.alloc_buffer([64, 768], dtype="float32")
+        for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 16, 1, 3):
+            for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(48, 8, 1, 16, 8, 16):
+                with T.block("T_matmul_NT"):
+                    i = T.axis.spatial(64, i0_2 * 8 + i0_3)
+                    j = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + i1_3)
+                    k = T.axis.reduce(768, i2_0 * 16 + i2_1)
+                    T.reads(placeholder[i, k], placeholder_1[j, k])
+                    T.writes(T_matmul_NT_global[i, j])
+                    with T.init():
+                        T_matmul_NT_global[i, j] = T.float32(0)
+                    T_matmul_NT_global[i, j] = T_matmul_NT_global[i, j] + placeholder[i, k] * placeholder_1[j, k]
+            for ax0, ax1 in T.grid(64, 16):
+                with T.block("T_matmul_NT_global"):
+                    v0 = T.axis.spatial(64, ax0)
+                    v1 = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + ax1)
+                    T.reads(T_matmul_NT_global[v0, v1])
+                    T.writes(T_matmul_NT[v0, v1])
+                    T_matmul_NT[v0, v1] = T_matmul_NT_global[v0, v1]
+
+@T.prim_func
+def after_matmul_vectorize(
+    placeholder: T.Buffer[(64, 768), "float32"],
+    placeholder_1: T.Buffer[(768, 768), "float32"],
+    T_matmul_NT: T.Buffer[(64, 768), "float32"],
+) -> None:
+    T.func_attr({"global_symbol": "main", "tir.noalias": True, "layout_free_placeholders": [1]})
+    T_matmul_NT_global = T.alloc_buffer([64, 768], dtype="float32")
+    for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 16, 1, 3):
+        for i2_0, i0_2, i1_2, i2_1, i0_3 in T.grid(48, 8, 1, 16, 8):
+            for i1_3_fused in T.vectorized(16):
+                with T.block("T_matmul_NT"):
+                    i = T.axis.spatial(64, i0_2 * 8 + i0_3)
+                    j = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + i1_3_fused)
+                    k = T.axis.reduce(768, i2_0 * 16 + i2_1)
+                    T.reads(placeholder[i, k], placeholder_1[j, k])
+                    T.writes(T_matmul_NT_global[i, j])
+                    with T.init():
+                        T_matmul_NT_global[i, j] = T.float32(0)
+                    T_matmul_NT_global[i, j] = T_matmul_NT_global[i, j] + placeholder[i, k] * placeholder_1[j, k]
+        for ax0 in T.serial(64):
+            for ax1_fused in T.vectorized(16):
+                with T.block("T_matmul_NT_global"):
+                    v0 = T.axis.spatial(64, ax0)
+                    v1 = T.axis.spatial(768, i1_0 * 48 + i1_1 * 16 + ax1_fused)
+                    T.reads(T_matmul_NT_global[v0, v1])
+                    T.writes(T_matmul_NT[v0, v1])
+                    T_matmul_NT[v0, v1] = T_matmul_NT_global[v0, v1]
+
+
 # fmt: on
 # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable
 
@@ -78,10 +156,17 @@ def test_meta_schedule_postproc_rewrite_parallel_unroll_vectorize():
     postproc = RewriteParallelVectorizeUnroll()
     sch = Schedule(Move_PUV)
     assert postproc.apply(sch)
-    print(sch.mod["main"].script())
     mod = tvm.tir.transform.Simplify()(sch.mod)
     tvm.ir.assert_structural_equal(mod["main"], Move_PUV0)
 
 
+def test_vectorize_inner_loop():
+    sch = Schedule(before_matmul_vectorize)
+    rule = RewriteParallelVectorizeUnroll()
+    assert rule.apply(sch)
+    tvm.ir.assert_structural_equal(sch.mod["main"], after_matmul_vectorize)
+
+
 if __name__ == "__main__":
     test_meta_schedule_postproc_rewrite_parallel_unroll_vectorize()
+    test_vectorize_inner_loop()

From 83c9ee1a26ff66b9300615a50b4b400ff83cb06d Mon Sep 17 00:00:00 2001
From: Christoph Gerum <christoph.gerum@uni-tuebingen.de>
Date: Mon, 23 May 2022 12:12:46 +0200
Subject: [PATCH 56/59] Fix int8 cuda kernels on older SM versions (#11389)

* Fix int8 cuda kernels on older SM versions

* Update target.py

* Simplify initialization of do_tensorize

* Simplify initialization of do_tensorize dense

* Simplify initialization of do_tensorize in group_conv_nchw

* Fix tensorize for conv2d_int8 as well.

* Try to make linter happy

* make linter happy

* Fix wrong commit to auto_scheduler
---
 python/tvm/target/target.py               | 4 ++++
 python/tvm/topi/cuda/batch_matmul.py      | 7 ++-----
 python/tvm/topi/cuda/conv2d_int8.py       | 7 +++----
 python/tvm/topi/cuda/dense.py             | 6 ++----
 python/tvm/topi/cuda/group_conv2d_nchw.py | 4 +---
 5 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 101980941fb04..a37727e926c02 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -191,6 +191,10 @@ def mattr(self):
     def supports_integer_dot_product(self):
         if self.attrs.get("supports_integer_dot_product", []):
             return bool(self.attrs["supports_integer_dot_product"])
+        if self.kind.name == "cuda":  # kind is a TargetKind, not a str; compare by name
+            sm_version = int(self.arch.split("_")[1])  # assumes arch is "sm_<NN>" -- TODO confirm
+            if sm_version >= 61:
+                return True
         return False
 
     @property
diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py
index ff625d6d714ce..4e476094f2d91 100644
--- a/python/tvm/topi/cuda/batch_matmul.py
+++ b/python/tvm/topi/cuda/batch_matmul.py
@@ -22,7 +22,7 @@
 from tvm.contrib import cublas
 from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 from .. import nn, generic
-from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor, is_target
+from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor
 from .tensor_intrin import dp4a
 
 
@@ -367,10 +367,7 @@ def _schedule_batch_matmul_int8(cfg, s, output):
     # dp4a tensorize
 
     target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = True
-
-    if is_target(["vulkan", "rocm"]):
-        do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
+    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
 
     if do_tensorize:
         dtypes = (input_x.dtype, input_y.dtype)
diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py
index a8b21a1deca04..0edd64e0e3794 100644
--- a/python/tvm/topi/cuda/conv2d_int8.py
+++ b/python/tvm/topi/cuda/conv2d_int8.py
@@ -26,7 +26,7 @@
 from ..nn.pad import pad
 from ..nn.conv2d import unpack_NCHWc_to_nchw
 from ..nn.utils import get_pad_tuple
-from ..utils import get_const_tuple, traverse_inline, is_target
+from ..utils import get_const_tuple, traverse_inline
 
 
 def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"):
@@ -311,9 +311,8 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output):
 
     _, rc_block = s[conv].split(rc_block, factor=4)
     target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = True
-    if is_target(["vulkan", "rocm"]):
-        do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
+    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
+
     if do_tensorize:
         dtypes = (pad_data.dtype, packed_kernel.dtype)
         s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes))
diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py
index 859f6c1097c64..32b80db6d5849 100644
--- a/python/tvm/topi/cuda/dense.py
+++ b/python/tvm/topi/cuda/dense.py
@@ -24,7 +24,7 @@
 from .tensor_intrin import dp4a
 from .. import tag
 from .. import generic
-from ..utils import traverse_inline, get_const_tuple, is_target
+from ..utils import traverse_inline, get_const_tuple
 
 logger = logging.getLogger("topi")
 
@@ -172,9 +172,7 @@ def _schedule_dense_int8(cfg, s, output):
     ko, ki = s[CC].split(ko, factor=4)
     ko, kt = cfg["tile_k"].apply(s, CC, ko)
     target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = True
-    if is_target(["vulkan", "rocm"]):
-        do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
+    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
 
     if do_tensorize:
         dtypes = (data.dtype, weight.dtype)
diff --git a/python/tvm/topi/cuda/group_conv2d_nchw.py b/python/tvm/topi/cuda/group_conv2d_nchw.py
index f786b0d8d6477..b48ea3a5f8be2 100644
--- a/python/tvm/topi/cuda/group_conv2d_nchw.py
+++ b/python/tvm/topi/cuda/group_conv2d_nchw.py
@@ -507,9 +507,7 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output):
     s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
     _, rc_block = s[conv].split(rc_block, factor=4)
     target = tvm.target.Target.current(allow_none=False)
-    do_tensorize = True
-    if "vulkan" in target.keys:
-        do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
+    do_tensorize = "+dotprod" in target.mattr or target.supports_integer_dot_product
     if do_tensorize:
         dtypes = (pad_data.dtype, packed_kernel.dtype)
         s[conv].tensorize(rc_block, dp4a("shared", "shared", "local", dtypes))

From df632baa78a4f550759d62fbc252039bfd9a64c3 Mon Sep 17 00:00:00 2001
From: Florin Blanaru <florin.blanaru96@gmail.com>
Date: Mon, 23 May 2022 11:14:00 +0100
Subject: [PATCH 57/59] [Tests] Replace the Relay interpreter with the VM in
 the op tests (#11386)

---
 python/tvm/relay/testing/__init__.py          |   9 +-
 .../relay/dyn/test_dynamic_op_level10.py      |  54 ++--
 .../relay/dyn/test_dynamic_op_level2.py       |  46 ++--
 .../relay/dyn/test_dynamic_op_level3.py       |  71 +++---
 .../relay/dyn/test_dynamic_op_level5.py       |  15 +-
 .../relay/dyn/test_dynamic_op_level6.py       |  27 +-
 tests/python/relay/test_op_grad_level1.py     |  38 +--
 tests/python/relay/test_op_grad_level10.py    |  37 ++-
 tests/python/relay/test_op_grad_level2.py     | 151 +++++++----
 tests/python/relay/test_op_grad_level3.py     |  74 +++---
 tests/python/relay/test_op_grad_level4.py     |  47 ++--
 tests/python/relay/test_op_level1.py          |  39 ++-
 tests/python/relay/test_op_level10.py         | 236 ++++++++++--------
 tests/python/relay/test_op_level2.py          |  32 +--
 tests/python/relay/test_op_level3.py          |  69 +++--
 tests/python/relay/test_op_level4.py          |  48 ++--
 tests/python/relay/test_op_level5.py          | 166 +++++-------
 tests/python/relay/test_op_level6.py          |  55 ++--
 18 files changed, 645 insertions(+), 569 deletions(-)

diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
index 909712511061f..2399a474de88c 100644
--- a/python/tvm/relay/testing/__init__.py
+++ b/python/tvm/relay/testing/__init__.py
@@ -82,6 +82,7 @@ def check_grad(
     mean=0,
     mode="higher_order",
     target_devices=None,
+    executor_kind="debug",
 ):
     """Perform numerical gradient checking given a relay function.
 
@@ -146,8 +147,12 @@ def check_grad(
     for target, dev in target_devices:
         # Eval the backward and forward functions
         # TODO(mbs): Evaluate a pair of functions so can share preparation between them.
-        bwd_func_compiled = relay.create_executor(device=dev, target=target).evaluate(bwd_func)
-        fwd_func_compiled = relay.create_executor(device=dev, target=target).evaluate(fwd_func)
+        bwd_func_compiled = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(bwd_func)
+        fwd_func_compiled = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(fwd_func)
 
         # Get analytic gradients.
         _, grads = bwd_func_compiled(*inputs)
diff --git a/tests/python/relay/dyn/test_dynamic_op_level10.py b/tests/python/relay/dyn/test_dynamic_op_level10.py
index d34b80303b290..5a31977b45068 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level10.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level10.py
@@ -27,9 +27,11 @@
 import random
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("debug", "vm")
+
 
 @tvm.testing.uses_gpu
-def test_broadcast_to():
+def test_broadcast_to(executor_kind):
     def verify_more_dynamic_broadcast_to(x_shape, out_shape):
         rank = len(out_shape)
         dtype = "float32"
@@ -45,12 +47,13 @@ def verify_more_dynamic_broadcast_to(x_shape, out_shape):
         x = np.random.uniform(size=np.prod(x_shape)).astype(dtype)
         ref_res = np.broadcast_to(np.reshape(x, x_shape), out_shape)
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate(
-                    func
-                )(x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type))
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate(func)(
+                x, np.array(x_shape).astype(shape_type), np.array(out_shape).astype(shape_type)
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     verify_more_dynamic_broadcast_to((4, 3), (3, 4, 3))
 
@@ -70,12 +73,11 @@ def verify_broadcast_to(x_shape, out_shape):
         x = np.random.uniform(size=x_shape).astype(dtype)
         ref_res = np.broadcast_to(x, out_shape)
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate(
-                    func
-                )(x, np.array(out_shape).astype(shape_type))
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate(func)(x, np.array(out_shape).astype(shape_type))
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     verify_broadcast_to((1,), (1, 1, 1))
     verify_broadcast_to((1, 1), (4, 1, 1))
@@ -83,7 +85,7 @@ def verify_broadcast_to(x_shape, out_shape):
 
 
 @tvm.testing.uses_gpu
-def test_dyn_broadcast_to():
+def test_dyn_broadcast_to(executor_kind):
     dtype = "uint8"
     rank = 3
     shape_type = "int64"
@@ -101,16 +103,15 @@ def test_dyn_broadcast_to():
     dyn_shape = (1,) * rank
     ref_res = np.broadcast_to(x, dyn_shape)
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["vm", "debug"]:
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate(func)(
-                x, np.array(dyn_shape).astype(shape_type)
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        mod = tvm.ir.IRModule.from_expr(func)
+        op_res = relay.create_executor(executor_kind, mod=mod, device=dev, target=target).evaluate(
+            func
+        )(x, np.array(dyn_shape).astype(shape_type))
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_dyn_one_hot():
+def test_dyn_one_hot(executor_kind):
     def _get_oshape(indices_shape, depth, axis):
         oshape = []
         true_axis = len(indices_shape) if axis == -1 else axis
@@ -135,12 +136,11 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
         indices_np = np.random.randint(0, depth, size=indices_shape).astype("int32")
         out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                out_relay = relay.create_executor(
-                    kind, mod=mod, device=dev, target=target
-                ).evaluate()(indices_np, np.array(depth).astype("int32"))
-                tvm.testing.assert_allclose(out_relay.numpy(), out_np)
+            mod = tvm.ir.IRModule.from_expr(func)
+            out_relay = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate()(indices_np, np.array(depth).astype("int32"))
+            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
 
     _verify((3,), 3, 1, 0, -1, "int32")
     _verify((3,), 3, 1.0, 0.0, -1, "float32")
diff --git a/tests/python/relay/dyn/test_dynamic_op_level2.py b/tests/python/relay/dyn/test_dynamic_op_level2.py
index fd7ab70028067..a017762ce35db 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level2.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level2.py
@@ -27,9 +27,11 @@
 import tvm.topi.testing
 from tvm.relay.testing import run_infer_type
 
+executor_kind = tvm.testing.parameter("debug", "vm")
+
 
 @tvm.testing.uses_gpu
-def test_dyn_upsampling_run():
+def test_dyn_upsampling_run(executor_kind):
     def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=False):
 
         if layout == "NCHW":
@@ -58,12 +60,13 @@ def verify_upsampling(dshape, scale_h, scale_w, layout, method, align_corners=Fa
         func = relay.Function([x, scale_h_var, scale_w_var], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32")
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate()(
+                x_data, np.array(scale_h).astype("float32"), np.array(scale_w).astype("float32")
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
 
     verify_upsampling((1, 16, 32, 32), 3, 2.0, "NCHW", "nearest_neighbor")
     verify_upsampling((1, 16, 32, 32), 5, 2.0, "NCHW", "bilinear", True)
@@ -85,7 +88,7 @@ def test_dyn_upsampling_infer_type_const():
 
 
 @tvm.testing.uses_gpu
-def test_dyn_upsampling3d_run():
+def test_dyn_upsampling3d_run(executor_kind):
     def verify_upsampling3d(
         dshape, scale_d, scale_h, scale_w, layout, method, coord_trans="asymmetric"
     ):
@@ -124,15 +127,16 @@ def verify_upsampling3d(
         func = relay.Function([x, scale_d_var, scale_h_var, scale_w_var], z)
 
         for target, dev in enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    x_data,
-                    np.array(scale_d).astype("float32"),
-                    np.array(scale_h).astype("float32"),
-                    np.array(scale_w).astype("float32"),
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate()(
+                x_data,
+                np.array(scale_d).astype("float32"),
+                np.array(scale_h).astype("float32"),
+                np.array(scale_w).astype("float32"),
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
 
     verify_upsampling3d((1, 1, 1, 1, 1), 2, 3, 4, "NCDHW", "nearest_neighbor")
     verify_upsampling3d((1, 8, 16, 16, 16), 2.0, 3.0, 4.0, "NCDHW", "nearest_neighbor")
@@ -163,7 +167,7 @@ def test_dyn_upsampling3d_infer_type_const():
 
 
 @tvm.testing.uses_gpu
-def test_dyn_pad():
+def test_dyn_pad(executor_kind):
     def verify_pad(dshape, pad_width, pad_val, dtype):
         x = relay.var("x", relay.TensorType(dshape, dtype))
         ndim = len(dshape)
@@ -178,7 +182,9 @@ def verify_pad(dshape, pad_width, pad_val, dtype):
         ref_res = np.pad(data, pad_width, "constant", constant_values=(((pad_val,) * 2),) * ndim)
         pad_width = np.array(pad_width).astype("int64")
 
-        verify_func(func, [data, pad_width, np.array(pad_val).astype(dtype)], ref_res)
+        verify_func(
+            executor_kind, func, [data, pad_width, np.array(pad_val).astype(dtype)], ref_res
+        )
 
     def verify_pad_default_fill(dshape, pad_width, dtype):
         x = relay.var("x", relay.TensorType(dshape, dtype))
@@ -193,7 +199,7 @@ def verify_pad_default_fill(dshape, pad_width, dtype):
         ref_res = np.pad(data, pad_width)
         pad_width = np.array(pad_width).astype("int64")
 
-        verify_func(func, [data, pad_width], ref_res)
+        verify_func(executor_kind, func, [data, pad_width], ref_res)
 
     verify_pad((4, 10, 7, 7), ((1, 1), (2, 2), (3, 3), (4, 4)), 2.0, "int32")
     verify_pad((2, 7), ((1, 4), (2, 2)), 4.0, "float64")
diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py
index 0456401e8ad29..0e68cd7246ac9 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level3.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level3.py
@@ -23,24 +23,25 @@
 from tvm import relay, te
 from tvm.relay.testing import check_grad, run_infer_type
 
+executor_kind = tvm.testing.parameter("debug", "vm")
 
-def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets()):
+
+def verify_func(executor_kind, func, data, ref_res, target_device=tvm.testing.enabled_targets()):
     assert isinstance(data, list)
     for target, dev in target_device:
-        for kind in ["vm", "debug"]:
-            mod = tvm.ir.IRModule.from_expr(func)
-            op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                *data
-            )
-            if isinstance(op_res, tvm.runtime.container.ADT):
-                assert len(op_res) == len(
-                    ref_res
-                ), "Outputs from TVM and Python implementation must be equal "
-                for op_result, ref_result in zip(op_res, ref_res):
-                    tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5)
-            else:
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
-            relay.backend.te_compiler.get().clear()
+        mod = tvm.ir.IRModule.from_expr(func)
+        op_res = relay.create_executor(
+            executor_kind, mod=mod, device=dev, target=target
+        ).evaluate()(*data)
+        if isinstance(op_res, tvm.runtime.container.ADT):
+            assert len(op_res) == len(
+                ref_res
+            ), "Outputs from TVM and Python implementation must be equal "
+            for op_result, ref_result in zip(op_res, ref_res):
+                tvm.testing.assert_allclose(op_result.numpy(), ref_result, rtol=1e-5)
+        else:
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        relay.backend.te_compiler.get().clear()
 
 
 def check_on_vm(target, dev, args, expected_result, mod):
@@ -53,7 +54,7 @@ def check_on_vm(target, dev, args, expected_result, mod):
 
 
 @tvm.testing.uses_gpu
-def test_dyn_reshape():
+def test_dyn_reshape(executor_kind):
     def verify_reshape(shape, newshape, oshape):
         x = relay.var("x", relay.TensorType(shape, "float32"))
         y = relay.var("y", relay.TensorType((len(newshape),), "int64"))
@@ -69,7 +70,7 @@ def verify_reshape(shape, newshape, oshape):
             test_inputs=[x_data],
             eps=1e-3,
         )
-        verify_func(func, [x_data, np.array(newshape).astype("int64")], ref_res)
+        verify_func(executor_kind, func, [x_data, np.array(newshape).astype("int64")], ref_res)
 
     verify_reshape((2, 3, 4), (8, 3), (8, 3))
     verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
@@ -83,7 +84,7 @@ def verify_reshape(shape, newshape, oshape):
 
 
 @tvm.testing.uses_gpu
-def test_dyn_shape_reshape():
+def test_dyn_shape_reshape(executor_kind):
     def verify_reshape(shape, newshape, oshape):
         x = relay.var("x", relay.TensorType(shape, "float32"))
         y = relay.var("y", relay.TensorType(newshape, "float32"))
@@ -94,13 +95,13 @@ def verify_reshape(shape, newshape, oshape):
         y_data = np.random.uniform(low=-1, high=1, size=newshape).astype("float32")
         ref_res = np.reshape(x_data, oshape)
         check_grad(run_infer_type(func), inputs=[x_data, y_data], eps=1e-3)
-        verify_func(func, [x_data, y_data], ref_res)
+        verify_func(executor_kind, func, [x_data, y_data], ref_res)
 
     verify_reshape((2, 3, 4), (8, 3), (8, 3))
     verify_reshape((4, 7), (2, 7, 2), (2, 7, 2))
 
 
-def test_squeeze():
+def test_squeeze(executor_kind):
     def verify_squeeze(shape, dtype, axis):
         x = relay.var("x", relay.TensorType(shape, dtype))
         assert axis is not None
@@ -110,14 +111,14 @@ def verify_squeeze(shape, dtype, axis):
         func = relay.Function([x, axis], squeeze)
         x_data = np.random.random_sample(shape).astype(dtype)
         ref_res = np.squeeze(x_data, axis=np_axis)
-        verify_func(func, [x_data, np.array(np_axis).astype("int64")], ref_res)
+        verify_func(executor_kind, func, [x_data, np.array(np_axis).astype("int64")], ref_res)
 
     verify_squeeze((1, 3, 1), "float32", [0])
     verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
 
 
 @tvm.testing.uses_gpu
-def test_dyn_expand_dims():
+def test_dyn_expand_dims(executor_kind):
     def verify_expand_dims(
         dshape, dtype, oshape, axis, num_newaxis, target_device=tvm.testing.enabled_targets()
     ):
@@ -130,7 +131,7 @@ def verify_expand_dims(
         data_np = np.random.uniform(size=dshape).astype(dtype)
         axis_np = np.array(axis).astype("int64")
         ref_res = data_np.reshape(oshape)
-        verify_func(func, [data_np, axis_np], ref_res, target_device=target_device)
+        verify_func(executor_kind, func, [data_np, axis_np], ref_res, target_device=target_device)
 
     for dtype in ["float16", "float32"]:
         verify_expand_dims((2, 2), dtype, (2, 2, 1), 2, 1)
@@ -146,7 +147,7 @@ def verify_expand_dims(
 
 
 @tvm.testing.uses_gpu
-def test_dyn_tile():
+def test_dyn_tile(executor_kind):
     def verify_tile(dshape, reps):
         x = relay.var("x", relay.TensorType(dshape, "float32"))
         r = relay.var("reps", relay.TensorType((len(reps),), "float32"))
@@ -156,7 +157,7 @@ def verify_tile(dshape, reps):
         x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
         ref_res = np.tile(x_data, reps=reps)
         reps_data = np.array(reps).astype("float32")
-        verify_func(func, [x_data, np.array(reps).astype("float32")], ref_res)
+        verify_func(executor_kind, func, [x_data, np.array(reps).astype("float32")], ref_res)
 
     verify_tile((2, 3, 4), (3, 2, 1))
     verify_tile((2, 3, 4), (1, 2))
@@ -164,7 +165,7 @@ def verify_tile(dshape, reps):
 
 
 @tvm.testing.uses_gpu
-def test_dyn_zeros_ones():
+def test_dyn_zeros_ones(executor_kind):
     def verify_zeros_ones(shape, dtype):
         for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
             rank = len(shape)
@@ -175,14 +176,16 @@ def verify_zeros_ones(shape, dtype):
 
             func = relay.Function([dyn_shape], y)
             ref_res = ref(shape, dtype)
-            verify_func(func, [np.array(shape).astype("int64")], ref_res.astype("int64"))
+            verify_func(
+                executor_kind, func, [np.array(shape).astype("int64")], ref_res.astype("int64")
+            )
 
     verify_zeros_ones((1, 3), "int64")
     verify_zeros_ones((8, 9, 1, 2), "float32")
 
 
 @tvm.testing.uses_gpu
-def test_dyn_full():
+def test_dyn_full(executor_kind):
     def verify_full(fill_value, src_shape, dtype):
         x = relay.var("x", relay.scalar_type(dtype))
         rank = len(src_shape)
@@ -192,7 +195,10 @@ def verify_full(fill_value, src_shape, dtype):
         ref_res = np.full(src_shape, fill_value).astype(dtype)
 
         verify_func(
-            func, [np.array(fill_value).astype(dtype), np.array(src_shape).astype("int64")], ref_res
+            executor_kind,
+            func,
+            [np.array(fill_value).astype(dtype), np.array(src_shape).astype("int64")],
+            ref_res,
         )
 
     verify_full(4, (1, 3, 4, 4), "int32")
@@ -201,7 +207,7 @@ def verify_full(fill_value, src_shape, dtype):
 
 
 @tvm.testing.uses_gpu
-def test_dyn_sparse_to_dense():
+def test_dyn_sparse_to_dense(executor_kind):
     def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape, xpected):
         sparse_indices_data = np.array(sparse_indices)
         sparse_values_data = np.array(sparse_values)
@@ -242,7 +248,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
                 output_shape_data,
             ]
 
-        verify_func(func, arguments, xpected)
+        verify_func(executor_kind, func, arguments, xpected)
 
     verify_sparse_to_dense(1, 3, 0, [5], [0, 3, 0, 0, 0])  # scalar
     verify_sparse_to_dense([0, 1, 4], [3, 3, 3], 0, [5], [3, 3, 0, 0, 3])  # vector
@@ -301,7 +307,7 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_
 @pytest.mark.parametrize("dtype", [np.int64, np.int32])
 @pytest.mark.parametrize("use_dyn", [True, False])
 def test_sparse_fill_empty_rows(
-    sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn
+    sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn, executor_kind
 ):
     def ref_sparse_fill_empty_rows(
         sparse_indices: np.ndarray,
@@ -404,6 +410,7 @@ def verify_sparse_fill_empty_rows(
         assert empty_row_indicator_infer_type.checked_type.dtype == "bool"
 
         verify_func(
+            executor_kind,
             func,
             [sparse_indices_np, sparse_values_np, dense_shape_np, default_value_np],
             ref_res,
diff --git a/tests/python/relay/dyn/test_dynamic_op_level5.py b/tests/python/relay/dyn/test_dynamic_op_level5.py
index 2eeeb1d828c98..58234929c7bbc 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level5.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level5.py
@@ -26,6 +26,8 @@
 import tvm.topi.testing
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("debug", "vm")
+
 
 def test_resize2d_infer_type():
     n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
@@ -37,7 +39,7 @@ def test_resize2d_infer_type():
 
 
 @tvm.testing.uses_gpu
-def test_resize2d():
+def test_resize2d(executor_kind):
     def verify_resize2d(dshape, scale, method, layout):
         if layout == "NHWC":
             size = (dshape[1] * scale, dshape[2] * scale)
@@ -62,12 +64,11 @@ def verify_resize2d(dshape, scale, method, layout):
         )
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    x_data, size
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate()(x_data, size)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4, atol=1e-6)
 
     for method in ["linear", "nearest_neighbor"]:
         for layout in ["NCHW", "NHWC"]:
diff --git a/tests/python/relay/dyn/test_dynamic_op_level6.py b/tests/python/relay/dyn/test_dynamic_op_level6.py
index 530c402b2947b..ebf9c36263bef 100644
--- a/tests/python/relay/dyn/test_dynamic_op_level6.py
+++ b/tests/python/relay/dyn/test_dynamic_op_level6.py
@@ -22,9 +22,11 @@
 from tvm import relay
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("debug", "vm")
+
 
 @tvm.testing.uses_gpu
-def test_dynamic_topk():
+def test_dynamic_topk(executor_kind):
     def verify_topk(k, axis, ret_type, is_ascend, dtype):
         shape = (20, 100)
         x = relay.var("x", relay.TensorType(shape, "float32"))
@@ -53,18 +55,17 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype):
         np_indices = np_indices.astype(dtype)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["vm", "debug"]:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    np_data, np.array([k]).astype("float32")
-                )
-                if ret_type == "both":
-                    tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
-                    tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
-                elif ret_type == "values":
-                    tvm.testing.assert_allclose(op_res.numpy(), np_values)
-                else:
-                    tvm.testing.assert_allclose(op_res.numpy(), np_indices)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(
+                executor_kind, mod=mod, device=dev, target=target
+            ).evaluate()(np_data, np.array([k]).astype("float32"))
+            if ret_type == "both":
+                tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
+                tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
+            elif ret_type == "values":
+                tvm.testing.assert_allclose(op_res.numpy(), np_values)
+            else:
+                tvm.testing.assert_allclose(op_res.numpy(), np_indices)
 
     np.random.seed(0)
     for k in [0, 1, 5]:
diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py
index a31191a42c48f..cb94f297cfa32 100644
--- a/tests/python/relay/test_op_grad_level1.py
+++ b/tests/python/relay/test_op_grad_level1.py
@@ -26,6 +26,8 @@
 from tvm.relay.testing import check_grad, run_infer_type
 from tvm.relay.transform import gradient
 
+executor_kind = tvm.testing.parameter("debug")
+
 
 def sigmoid(x):
     one = np.ones_like(x)
@@ -67,7 +69,7 @@ class TestUnaryOp:
     dtype = tvm.testing.parameter("float32", "float64")
     shape = tvm.testing.parameter((10, 4))
 
-    def test_op(self, target, dev, relay_op, ref_func, shape, dtype):
+    def test_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype):
 
         target = tvm.target.Target(target)
         if target.kind.name == "vulkan":
@@ -125,9 +127,9 @@ def test_op(self, target, dev, relay_op, ref_func, shape, dtype):
         grad_in = np.random.rand(*shape).astype(dtype)
         ref_grad_out = ref_func(data_in, grad_in)
 
-        op_res, (op_grad, _) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)(
-            data_in, grad_in
-        )
+        op_res, (op_grad, _) = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(bwd_func)(data_in, grad_in)
         np.testing.assert_allclose(op_grad.numpy(), ref_grad_out, rtol=0.01)
 
 
@@ -143,7 +145,7 @@ class TestBinaryOp:
     dtype = tvm.testing.parameter("float32", "float64")
     shape = tvm.testing.parameter((5, 10, 5))
 
-    def test_binary_op(self, target, dev, relay_op, ref_func, shape, dtype):
+    def test_binary_op(self, target, dev, executor_kind, relay_op, ref_func, shape, dtype):
         t = relay.TensorType(shape, dtype=dtype)
         x = relay.var("x", t)
         y = relay.var("y", t)
@@ -156,31 +158,31 @@ def test_binary_op(self, target, dev, relay_op, ref_func, shape, dtype):
         fwd_func = run_infer_type(fwd_func)
         bwd_func = run_infer_type(gradient(fwd_func))
 
-        op_res, (op_grad0, op_grad1) = relay.create_executor(device=dev, target=target).evaluate(
-            bwd_func
-        )(x_data, y_data)
+        op_res, (op_grad0, op_grad1) = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(bwd_func)(x_data, y_data)
         np.testing.assert_allclose(op_grad0.numpy(), ref_grad0, rtol=0.01)
         np.testing.assert_allclose(op_grad1.numpy(), ref_grad1, rtol=0.01)
 
 
-def test_softmax_grad(target, dev):
+def test_softmax_grad(executor_kind, target, dev):
     target = tvm.target.Target(target)
     if target.kind.name == "vulkan":
         pytest.xfail("Known failure on vulkan")
 
     data = relay.var("data", relay.TensorType((1, 16), "float64"))
     fwd_func = relay.Function([data], relay.nn.softmax(data))
-    check_grad(fwd_func, scale=1, target_devices=[(target, dev)])
+    check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind)
 
 
-def test_log_softmax_grad(target, dev):
+def test_log_softmax_grad(executor_kind, target, dev):
     target = tvm.target.Target(target)
     if target.kind.name == "vulkan":
         pytest.xfail("Known failure on vulkan")
 
     data = relay.var("data", relay.TensorType((2, 16), "float64"))
     fwd_func = relay.Function([data], relay.nn.log_softmax(data))
-    check_grad(fwd_func, scale=1, target_devices=[(target, dev)])
+    check_grad(fwd_func, scale=1, target_devices=[(target, dev)], executor_kind=executor_kind)
 
 
 class TestBiasAddGrad:
@@ -191,25 +193,25 @@ class TestBiasAddGrad:
         ((4, 8), (8,), 1),
     )
 
-    def test_bias_add(self, target, dev, d_shape, b_shape, axis):
+    def test_bias_add(self, executor_kind, target, dev, d_shape, b_shape, axis):
         data = relay.var("data", relay.TensorType(d_shape, "float32"))
         bias = relay.var("bias", relay.TensorType(b_shape, "float32"))
         fwd_func = relay.Function([data, bias], relay.nn.bias_add(data, bias, axis=axis))
-        check_grad(fwd_func, target_devices=[(target, dev)])
+        check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
 
 
-def test_expand_dims_grad(target, dev):
+def test_expand_dims_grad(executor_kind, target, dev):
     data = relay.var("data", shape=(2, 3), dtype="float64")
     fwd_func = relay.Function([data], relay.expand_dims(data, axis=1, num_newaxis=2))
-    check_grad(fwd_func, target_devices=[(target, dev)])
+    check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
 
 
-def test_concatenate_grad(target, dev):
+def test_concatenate_grad(executor_kind, target, dev):
     x = relay.var("x", shape=(2, 2, 5))
     y = relay.var("y", shape=(2, 1, 5))
     z = relay.var("z", shape=(2, 4, 5))
     fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1))
-    check_grad(fwd_func, target_devices=[(target, dev)])
+    check_grad(fwd_func, target_devices=[(target, dev)], executor_kind=executor_kind)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_op_grad_level10.py b/tests/python/relay/test_op_grad_level10.py
index 4c2c9082e0443..6b2531a4a1f6b 100644
--- a/tests/python/relay/test_op_grad_level10.py
+++ b/tests/python/relay/test_op_grad_level10.py
@@ -28,9 +28,10 @@
 
 index_dtype = tvm.testing.parameter("int32", "int64")
 val_dtype = tvm.testing.parameter("float32", "float64")
+executor_kind = tvm.testing.parameter("debug")
 
 
-def test_cross_entropy_grad(target, dev, val_dtype):
+def test_cross_entropy_grad(executor_kind, target, dev, val_dtype):
     target = tvm.target.Target(target)
     if target.kind.name == "vulkan" and val_dtype == "float64":
         # GLSL.std.450's Log implementation only takes 16/32-bit floats.
@@ -44,10 +45,11 @@ def test_cross_entropy_grad(target, dev, val_dtype):
         scale=0.1,
         mean=1,
         target_devices=[(target, dev)],
+        executor_kind=executor_kind,
     )
 
 
-def test_cross_entropy_with_logits_grad(target, dev, val_dtype):
+def test_cross_entropy_with_logits_grad(executor_kind, target, dev, val_dtype):
     x = relay.var("x", shape=(2, 5), dtype=val_dtype)
     y = relay.var("y", shape=(2, 5), dtype=val_dtype)
     check_grad(
@@ -56,13 +58,16 @@ def test_cross_entropy_with_logits_grad(target, dev, val_dtype):
         scale=0.1,
         mean=1,
         target_devices=[(target, dev)],
+        executor_kind=executor_kind,
     )
 
 
-def test_checkpoint(target, dev):
+def test_checkpoint(executor_kind, target, dev):
     inputs = [relay.var("x{}".format(i), shape=(1,)) for i in range(4)]
     output = relay.multiply(relay.add(inputs[0], inputs[1]), relay.add(inputs[2], inputs[3]))
-    check_grad(relay.Function(inputs, relay.annotation.checkpoint(output)))
+    check_grad(
+        relay.Function(inputs, relay.annotation.checkpoint(output)), executor_kind=executor_kind
+    )
 
     scope = relay.ScopeBuilder()
     out_tuple = scope.let(
@@ -76,7 +81,11 @@ def test_checkpoint(target, dev):
         )
     )
     out_single = scope.get()
-    check_grad(relay.Function(inputs, out_single), target_devices=[(target, dev)])
+    check_grad(
+        relay.Function(inputs, out_single),
+        target_devices=[(target, dev)],
+        executor_kind=executor_kind,
+    )
 
 
 class TestBatchMatmulGrad:
@@ -87,7 +96,9 @@ class TestBatchMatmulGrad:
         ((2, 5, 3), (2, 4, 5), True, True),
     )
 
-    def test_batch_matmul_grad(self, target, dev, a_shape, b_shape, transpose_a, transpose_b):
+    def test_batch_matmul_grad(
+        self, executor_kind, target, dev, a_shape, b_shape, transpose_a, transpose_b
+    ):
         tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32"))
         tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32"))
         check_grad(
@@ -98,18 +109,20 @@ def test_batch_matmul_grad(self, target, dev, a_shape, b_shape, transpose_a, tra
                 ),
             ),
             target_devices=[(target, dev)],
+            executor_kind=executor_kind,
         )
 
 
-def test_reverse_reshape_grad(target, dev):
+def test_reverse_reshape_grad(executor_kind, target, dev):
     x = relay.var("x", shape=(3, 4, 5), dtype="float64")
     check_grad(
         relay.Function([x], relay.op.reverse_reshape(x, (-1, 0))),
         target_devices=[(target, dev)],
+        executor_kind=executor_kind,
     )
 
 
-def test_one_hot_grad(target, dev, index_dtype, val_dtype):
+def test_one_hot_grad(executor_kind, target, dev, index_dtype, val_dtype):
     indices_shape = (3, 4)
     depth = 5
     axis = -1
@@ -127,7 +140,13 @@ def test_one_hot_grad(target, dev, index_dtype, val_dtype):
     y = relay.one_hot(indices, on_val, off_val, depth, axis, val_dtype)
     f = relay.Function([indices, on_val, off_val], y)
 
-    check_grad(f, inputs=inputs, test_inputs=test_inputs, target_devices=[(target, dev)])
+    check_grad(
+        f,
+        inputs=inputs,
+        test_inputs=test_inputs,
+        target_devices=[(target, dev)],
+        executor_kind=executor_kind,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py
index fcdcfe6accd85..820f724bfc43d 100644
--- a/tests/python/relay/test_op_grad_level2.py
+++ b/tests/python/relay/test_op_grad_level2.py
@@ -25,8 +25,10 @@
 from tvm.relay.transform import gradient
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("debug")
 
-def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode):
+
+def verify_max_pool2d_grad(executor_kind, x_shape, pool_size, strides, padding, ceil_mode):
     x = relay.var("x", relay.TensorType(x_shape, "float32"))
     y = tvm.relay.nn.max_pool2d(
         x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode
@@ -51,24 +53,41 @@ def verify_max_pool2d_grad(x_shape, pool_size, strides, padding, ceil_mode):
     )
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)(
-            data
-        )
+        op_res, (op_grad,) = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(bwd_func)(data)
         np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
 
 
 @tvm.testing.uses_gpu
-def test_max_pool2d_grad():
+def test_max_pool2d_grad(executor_kind):
     verify_max_pool2d_grad(
-        (1, 4, 16, 16), pool_size=(2, 2), strides=(2, 2), padding=(0, 0), ceil_mode=False
+        executor_kind,
+        (1, 4, 16, 16),
+        pool_size=(2, 2),
+        strides=(2, 2),
+        padding=(0, 0),
+        ceil_mode=False,
     )
     verify_max_pool2d_grad(
-        (1, 4, 16, 16), pool_size=(1, 1), strides=(1, 1), padding=(1, 1), ceil_mode=False
+        executor_kind,
+        (1, 4, 16, 16),
+        pool_size=(1, 1),
+        strides=(1, 1),
+        padding=(1, 1),
+        ceil_mode=False,
     )
 
 
 def verify_avg_pool2d_grad(
-    x_shape, pool_size, strides, padding, ceil_mode, count_include_pad, dtype="float32"
+    x_shape,
+    pool_size,
+    strides,
+    padding,
+    ceil_mode,
+    count_include_pad,
+    executor_kind,
+    dtype="float32",
 ):
 
     for shape_dtype in ["int32", "int64"]:
@@ -101,14 +120,14 @@ def verify_avg_pool2d_grad(
         )
 
         for target, dev in tvm.testing.enabled_targets():
-            op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(
-                bwd_func
-            )(data)
+            op_res, (op_grad,) = relay.create_executor(
+                executor_kind, device=dev, target=target
+            ).evaluate(bwd_func)(data)
             np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
 
 
 @tvm.testing.uses_gpu
-def test_avg_pool2d_grad():
+def test_avg_pool2d_grad(executor_kind):
     verify_avg_pool2d_grad(
         (1, 4, 16, 16),
         pool_size=(2, 2),
@@ -116,6 +135,7 @@ def test_avg_pool2d_grad():
         padding=(0, 0),
         ceil_mode=False,
         count_include_pad=True,
+        executor_kind=executor_kind,
     )
     verify_avg_pool2d_grad(
         (1, 4, 16, 16),
@@ -124,6 +144,7 @@ def test_avg_pool2d_grad():
         padding=(1, 1),
         ceil_mode=False,
         count_include_pad=False,
+        executor_kind=executor_kind,
     )
     verify_avg_pool2d_grad(
         (1, 4, 16, 16),
@@ -132,11 +153,12 @@ def test_avg_pool2d_grad():
         padding=(1, 1),
         ceil_mode=False,
         count_include_pad=False,
+        executor_kind=executor_kind,
         dtype="int32",
     )
 
 
-def verify_global_avg_pool2d_grad(x_shape):
+def verify_global_avg_pool2d_grad(executor_kind, x_shape):
     x = relay.var("x", relay.TensorType(x_shape, "float32"))
     y = tvm.relay.nn.global_avg_pool2d(x)
 
@@ -158,19 +180,21 @@ def verify_global_avg_pool2d_grad(x_shape):
     )
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)(
-            data
-        )
+        op_res, (op_grad,) = relay.create_executor(
+            executor_kind, device=dev, target=target
+        ).evaluate(bwd_func)(data)
         np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
 
 
 @tvm.testing.uses_gpu
-def test_global_avg_pool2d_grad():
-    verify_global_avg_pool2d_grad((1, 4, 16, 16))
-    verify_global_avg_pool2d_grad((1, 8, 8, 24))
+def test_global_avg_pool2d_grad(executor_kind):
+    verify_global_avg_pool2d_grad(executor_kind, (1, 4, 16, 16))
+    verify_global_avg_pool2d_grad(executor_kind, (1, 8, 8, 24))
 
 
-def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order"):
+def verify_conv2d_grad(
+    dshape, wshape, strides, padding, dilation, groups=1, mode="higher_order", executor_kind="vm"
+):
     dtype = "float32"
     data = relay.var("data", shape=dshape, dtype=dtype)
     weight = relay.var("weight", shape=wshape, dtype=dtype)
@@ -184,59 +208,73 @@ def verify_conv2d_grad(dshape, wshape, strides, padding, dilation, groups=1, mod
         out_dtype=dtype,
     )
     fwd_func = relay.Function([data, weight], conv)
-    check_grad(fwd_func, mode=mode)
+    check_grad(fwd_func, mode=mode, executor_kind=executor_kind)
 
 
 @tvm.testing.uses_gpu
-def test_conv2d_grad():
-    verify_conv2d_grad((1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1])
-    verify_conv2d_grad((1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1])
-    verify_conv2d_grad((1, 4, 16, 16), (16, 4, 1, 1), [2, 2], [0, 0], [1, 1])
-    verify_conv2d_grad((1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1], mode="first_order")
+def test_conv2d_grad(executor_kind):
+    verify_conv2d_grad(
+        (1, 4, 16, 16), (16, 4, 3, 3), [1, 1], [1, 1], [1, 1], executor_kind=executor_kind
+    )
+    verify_conv2d_grad(
+        (1, 4, 16, 16), (16, 4, 1, 1), [1, 1], [0, 0], [1, 1], executor_kind=executor_kind
+    )
+    verify_conv2d_grad(
+        (1, 4, 16, 16), (16, 4, 1, 1), [2, 2], [0, 0], [1, 1], executor_kind=executor_kind
+    )
+    verify_conv2d_grad(
+        (1, 4, 16, 16),
+        (16, 4, 3, 3),
+        [1, 1],
+        [1, 1],
+        [1, 1],
+        mode="first_order",
+        executor_kind=executor_kind,
+    )
 
 
-def verify_dense_grad(d_shape, w_shape):
+def verify_dense_grad(d_shape, w_shape, executor_kind):
     data = relay.var("data", relay.TensorType(d_shape, "float32"))
     weight = relay.var("weight", relay.TensorType(w_shape, "float32"))
     fwd_func = relay.Function([data, weight], relay.nn.dense(data, weight))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_dense_grad():
-    verify_dense_grad((1, 8), (16, 8))
-    verify_dense_grad((1, 4), (3, 4))
-    verify_dense_grad((5, 4), (3, 4))
+def test_dense_grad(executor_kind):
+    verify_dense_grad((1, 8), (16, 8), executor_kind)
+    verify_dense_grad((1, 4), (3, 4), executor_kind)
+    verify_dense_grad((5, 4), (3, 4), executor_kind)
 
 
-def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b):
+def verify_matmul_grad(a_shape, b_shape, transpose_a, transpose_b, executor_kind):
     tensor_a = relay.var("tensor_a", relay.TensorType(a_shape, "float32"))
     tensor_b = relay.var("tensor_b", relay.TensorType(b_shape, "float32"))
     fwd_func = relay.Function(
         [tensor_a, tensor_b],
         relay.nn.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b),
     )
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_matmul_grad():
-    verify_matmul_grad((1, 8), (8, 16), False, False)
-    verify_matmul_grad((4, 1), (4, 3), True, False)
-    verify_matmul_grad((4, 5), (3, 4), True, True)
+def test_matmul_grad(executor_kind):
+    verify_matmul_grad((1, 8), (8, 16), False, False, executor_kind)
+    verify_matmul_grad((4, 1), (4, 3), True, False, executor_kind)
+    verify_matmul_grad((4, 5), (3, 4), True, True, executor_kind)
 
 
-def verify_batch_flatten_grad(d_shape):
+def verify_batch_flatten_grad(d_shape, executor_kind):
     data = relay.var("data", relay.TensorType(d_shape, "float32"))
     fwd_func = relay.Function([data], relay.nn.batch_flatten(data))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_batch_flatten_grad():
-    verify_batch_flatten_grad((1, 2, 3, 4))
-    verify_batch_flatten_grad((1, 8))
+def test_batch_flatten_grad(executor_kind):
+    verify_batch_flatten_grad((1, 2, 3, 4), executor_kind)
+    verify_batch_flatten_grad((1, 8), executor_kind)
 
 
 def verify_conv2d_backward_weight(
-    dy_shape, x_shape, kernel_size, stride, padding, groups=1, out_channels=None
+    executor_kind, dy_shape, x_shape, kernel_size, stride, padding, groups=1, out_channels=None
 ):
     dtype = "float32"
     dy = relay.var("dy", shape=dy_shape, dtype=dtype)
@@ -265,7 +303,11 @@ def verify_conv2d_backward_weight(
         dy_np = np.random.randn(*dy_shape).astype(dtype)
         x_np = np.random.randn(*x_shape).astype(dtype)
 
-        dw_np = relay.create_executor(device=dev, target=target).evaluate(dw)(dy_np, x_np).numpy()
+        dw_np = (
+            relay.create_executor(executor_kind, device=dev, target=target)
+            .evaluate(dw)(dy_np, x_np)
+            .numpy()
+        )
         ref_dw_np = tvm.topi.testing.conv2d_backward_weight_python(
             dy_np, x_np, kernel_size, stride, padding, groups=groups, channels=out_channels
         )
@@ -273,11 +315,22 @@ def verify_conv2d_backward_weight(
         np.testing.assert_allclose(dw_np, ref_dw_np, rtol=1e-4, atol=1e-4)
 
 
-def test_conv2d_backward_weight():
-    verify_conv2d_backward_weight((2, 8, 32, 32), (2, 4, 32, 32), (3, 3), (1, 1), (1, 1))
-    verify_conv2d_backward_weight((2, 16, 15, 15), (2, 3, 32, 32), (3, 3), (2, 2), (0, 0))
+def test_conv2d_backward_weight(executor_kind):
+    verify_conv2d_backward_weight(
+        executor_kind, (2, 8, 32, 32), (2, 4, 32, 32), (3, 3), (1, 1), (1, 1)
+    )
+    verify_conv2d_backward_weight(
+        executor_kind, (2, 16, 15, 15), (2, 3, 32, 32), (3, 3), (2, 2), (0, 0)
+    )
     verify_conv2d_backward_weight(
-        (1, 16, 32, 32), (1, 16, 32, 32), (3, 3), (1, 1), (1, 1), groups=16, out_channels=16
+        executor_kind,
+        (1, 16, 32, 32),
+        (1, 16, 32, 32),
+        (3, 3),
+        (1, 1),
+        (1, 1),
+        groups=16,
+        out_channels=16,
     )
 
 
diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py
index 30d849853d879..89b8199b9e22a 100644
--- a/tests/python/relay/test_op_grad_level3.py
+++ b/tests/python/relay/test_op_grad_level3.py
@@ -24,9 +24,11 @@
 from tvm.relay.transform import gradient
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("debug")
+
 
 @tvm.testing.uses_gpu
-def test_clip():
+def test_clip(executor_kind):
     for dtype in ("float32", "float64"):
         ref = lambda x: np.where(
             x > 10.0, np.zeros_like(x), np.where(x < 1.0, np.zeros_like(x), np.ones_like(x))
@@ -41,49 +43,49 @@ def test_clip():
         bwd_func = run_infer_type(gradient(fwd_func))
 
         for target, dev in tvm.testing.enabled_targets():
-            op_res, (op_grad,) = relay.create_executor(device=dev, target=target).evaluate(
-                bwd_func
-            )(data)
+            op_res, (op_grad,) = relay.create_executor(
+                executor_kind, device=dev, target=target
+            ).evaluate(bwd_func)(data)
             np.testing.assert_allclose(op_grad.numpy(), ref_grad, rtol=0.01)
 
 
-def verify_transpose_grad(d_shape, axes=None):
+def verify_transpose_grad(d_shape, axes=None, executor_kind="vm"):
     data = relay.var("data", relay.TensorType(d_shape, "float32"))
     fwd_func = relay.Function([data], relay.transpose(data, axes=axes))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_transpose_grad():
-    verify_transpose_grad((1, 2, 3, 4))
-    verify_transpose_grad((1, 2, 3, 4), axes=(0, 2, 3, 1))
+def test_transpose_grad(executor_kind):
+    verify_transpose_grad((1, 2, 3, 4), executor_kind=executor_kind)
+    verify_transpose_grad((1, 2, 3, 4), axes=(0, 2, 3, 1), executor_kind=executor_kind)
 
 
-def test_negative_grad():
+def test_negative_grad(executor_kind):
     data = relay.var("data", relay.TensorType((10, 4), "float32"))
     fwd_func = relay.Function([data], relay.negative(data))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_cast_grad():
+def test_cast_grad(executor_kind):
     data = relay.var("data", relay.TensorType((10, 4), "float32"))
     fwd_func = relay.Function([data], relay.cast(data, "float64"))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_cast_like_grad():
+def test_cast_like_grad(executor_kind):
     data = relay.var("data", shape=(10, 4), dtype="float32")
     like = relay.var("like", shape=(1,), dtype="float64")
     fwd_func = relay.Function([data, like], relay.cast_like(data, like))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_copy_grad():
+def test_copy_grad(executor_kind):
     data = relay.var("data", relay.TensorType((10, 4), "float64"))
     fwd_func = relay.Function([data], relay.copy(data))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_take_grad():
+def test_take_grad(executor_kind):
     data_dtype = relay.TensorType((3, 4, 5), "float64")
     data = relay.var("data", data_dtype)
     indices = relay.var("indices", relay.TensorType((relay.Any(),), "int32"))
@@ -92,28 +94,28 @@ def test_take_grad():
 
     # take on axis
     fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=1))
-    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs)
+    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind)
 
     # take on flattened
     fwd_func = relay.Function([data, indices], relay.take(data, indices, axis=None))
-    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs)
+    check_grad(fwd_func, inputs=inputs, test_inputs=test_inputs, executor_kind=executor_kind)
 
 
-def test_stack_grad():
+def test_stack_grad(executor_kind):
     args = [relay.var(c, shape=(2, 3, 4), dtype="float64") for c in "xyz"]
     fwd_func = relay.Function(args, relay.stack(args, axis=0))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_squeeze_grad():
+def test_squeeze_grad(executor_kind):
     data = relay.var("data", shape=(2, 1, 1, 3, 4, 1), dtype="float64")
     fwd_func = relay.Function([data], relay.squeeze(data))
     fwd_func_subset = relay.Function([data], relay.squeeze(data, axis=[1, -1]))
-    check_grad(fwd_func)
-    check_grad(fwd_func_subset)
+    check_grad(fwd_func, executor_kind=executor_kind)
+    check_grad(fwd_func_subset, executor_kind=executor_kind)
 
 
-def test_arange_grad():
+def test_arange_grad(executor_kind):
     # TODO: testing arange numerically is strange because two-sided approx can
     #       produce different output shapes
     dtype = "float64"
@@ -122,23 +124,25 @@ def test_arange_grad():
     step = relay.var("step", relay.TensorType((), dtype))
     values = [np.array(v, dtype=dtype) for v in [2.5, 9.5, 1.8]]
     fwd_func = relay.Function([start, stop, step], relay.arange(start, stop, step, dtype))
-    check_grad(fwd_func, inputs=values)
+    check_grad(fwd_func, inputs=values, executor_kind=executor_kind)
 
 
-def test_gather_nd_grad():
+def test_gather_nd_grad(executor_kind):
     data = relay.var("data", relay.TensorType((2, 3), "float64"))
     indices = relay.var("indices", relay.TensorType((2, 4), "int64"))
     fwd = relay.Function([data, indices], relay.gather_nd(data, indices))
     data_np = np.random.rand(2, 3).astype("float64")
     indices_np = np.array([[0, 1, 1, 0], [0, 1, 0, 0]], dtype="int64")
-    check_grad(fwd, inputs=[data_np, indices_np], test_inputs=[data_np])
+    check_grad(
+        fwd, inputs=[data_np, indices_np], test_inputs=[data_np], executor_kind=executor_kind
+    )
 
 
-def test_reshape_like_grad():
+def test_reshape_like_grad(executor_kind):
     data = relay.var("data", shape=(2, 3, 4), dtype="float32")
     shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32")
     fwd_func = relay.Function([data, shape_like], relay.reshape_like(data, shape_like))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
 def test_zeros_ones_grad_const_ints():
@@ -172,7 +176,7 @@ def test_zeros_ones_grad_const_expr():
         tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_dyn)
 
 
-def test_zeros_ones_grad_dynamic():
+def test_zeros_ones_grad_dynamic(executor_kind):
     rank = np.random.randint(low=1, high=5, dtype="int32")
     dyn_shape = np.random.randint(low=1, high=4, size=(rank,), dtype="int32")
     shape_data = relay.var("shape_data", shape=(rank,), dtype="int32")
@@ -182,9 +186,9 @@ def test_zeros_ones_grad_dynamic():
         bwd_func = run_infer_type(gradient(run_infer_type(fwd_func)))
 
         for target, dev in tvm.testing.enabled_targets():
-            res, (grad,) = relay.create_executor(device=dev, target=target).evaluate(bwd_func)(
-                dyn_shape
-            )
+            res, (grad,) = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
+                bwd_func
+            )(dyn_shape)
             tvm.testing.assert_allclose(res.numpy(), op_ref(dyn_shape, dtype="float32"))
             tvm.testing.assert_allclose(grad.numpy(), np.zeros((rank,), dtype="int32"))
 
diff --git a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py
index 17d30cacac413..9ed2ef262777c 100644
--- a/tests/python/relay/test_op_grad_level4.py
+++ b/tests/python/relay/test_op_grad_level4.py
@@ -16,43 +16,46 @@
 # under the License.
 import pytest
 import numpy as np
+import tvm.testing
 from tvm import relay
 from tvm.relay.testing import check_grad, _np_randn_from_type
 
+executor_kind = tvm.testing.parameter("debug")
 
-def verify_reduction_grad(red_fn, d_shape, axis=None, keepdims=False, exclude=False):
+
+def verify_reduction_grad(executor_kind, red_fn, d_shape, axis=None, keepdims=False, exclude=False):
     data = relay.var("data", relay.TensorType(d_shape, "float32"))
     fwd_func = relay.Function([data], red_fn(data, axis=axis, keepdims=keepdims, exclude=exclude))
-    check_grad(fwd_func)
+    check_grad(fwd_func, executor_kind=executor_kind)
 
 
-def test_reduction_grad():
+def test_reduction_grad(executor_kind):
     def _unbiased_variance(x, axis=None, keepdims=False, exclude=False):
         return relay.variance(x, axis=axis, keepdims=keepdims, exclude=exclude, unbiased=True)
 
     for op in (relay.sum, relay.variance, _unbiased_variance, relay.mean):
-        verify_reduction_grad(op, (4, 2))
-        verify_reduction_grad(op, (4, 2), axis=-1, keepdims=True)
-        verify_reduction_grad(op, (4, 2, 1), axis=(1, 2), exclude=True)
-        verify_reduction_grad(op, (4, 2, 1), axis=1)
+        verify_reduction_grad(executor_kind, op, (4, 2))
+        verify_reduction_grad(executor_kind, op, (4, 2), axis=-1, keepdims=True)
+        verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=(1, 2), exclude=True)
+        verify_reduction_grad(executor_kind, op, (4, 2, 1), axis=1)
 
 
-def verify_max_grad(d_shape, axis=None, keepdims=False, exclude=False):
+def verify_max_grad(executor_kind, d_shape, axis=None, keepdims=False, exclude=False):
     data = relay.var("data", relay.TensorType(d_shape, "float32"))
     fwd_func = relay.Function(
         [data], relay.max(data, axis=axis, keepdims=keepdims, exclude=exclude)
     )
-    check_grad(fwd_func, scale=1e-3)
+    check_grad(fwd_func, scale=1e-3, executor_kind=executor_kind)
 
 
-def test_max_grad():
-    verify_max_grad((10, 10), axis=None)
-    verify_max_grad((10, 10), axis=-1)
-    verify_max_grad((6, 3, 2), axis=(1, 2), keepdims=True)
-    verify_max_grad((5, 4, 3), axis=(0, 2), exclude=True)
+def test_max_grad(executor_kind):
+    verify_max_grad(executor_kind, (10, 10), axis=None)
+    verify_max_grad(executor_kind, (10, 10), axis=-1)
+    verify_max_grad(executor_kind, (6, 3, 2), axis=(1, 2), keepdims=True)
+    verify_max_grad(executor_kind, (5, 4, 3), axis=(0, 2), exclude=True)
 
 
-def test_where_grad():
+def test_where_grad(executor_kind):
     cond_type = relay.TensorType((2, 3, 4), "int32")
     lhs_type = relay.TensorType((1, 3, 4), "float32")
     rhs_type = relay.TensorType((2, 1, 4), "float32")
@@ -66,10 +69,10 @@ def test_where_grad():
     lhs = relay.var("lhs", type_annotation=lhs_type)
     rhs = relay.var("rhs", type_annotation=rhs_type)
     fwd_func = relay.Function([cond, lhs, rhs], relay.where(cond, lhs, rhs))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:])
+    check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:], executor_kind=executor_kind)
 
 
-def test_less_equal_grad():
+def test_less_equal_grad(executor_kind):
     x_type = relay.TensorType((2, 3, 4), "float32")
     y_type = relay.TensorType((3, 1), "float32")
     # We need to generate inputs far apart to get correct numerical gradients
@@ -83,10 +86,10 @@ def test_less_equal_grad():
     x = relay.var("x", type_annotation=x_type)
     y = relay.var("y", type_annotation=y_type)
     fwd_func = relay.Function([x, y], relay.less_equal(x, y))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6)
+    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind)
 
 
-def test_not_equal_grad():
+def test_not_equal_grad(executor_kind):
     x_type = relay.TensorType((2, 3, 4), "float32")
     y_type = relay.TensorType((3, 1), "float32")
     # We need to generate inputs far apart to get correct numerical gradients
@@ -100,17 +103,17 @@ def test_not_equal_grad():
     x = relay.var("x", type_annotation=x_type)
     y = relay.var("y", type_annotation=y_type)
     fwd_func = relay.Function([x, y], relay.not_equal(x, y))
-    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6)
+    check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6, executor_kind=executor_kind)
 
 
-def test_strided_slice_grad():
+def test_strided_slice_grad(executor_kind):
     def check(sh, dtype, begin, end, strides, slice_mode):
         x = relay.var("x", shape=sh, dtype=dtype)
         f = relay.Function(
             [x],
             relay.strided_slice(x, begin=begin, end=end, strides=strides, slice_mode=slice_mode),
         )
-        check_grad(f)
+        check_grad(f, executor_kind=executor_kind)
 
     check((2, 3, 4), "float32", (0, 1, 0), (-1, -1, 1), (1, 1, 1), "size")
     check((2, 3, 4), "float32", (0, 1, 0), (2, 3, 1), (1, 1, 1), "end")
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index d4238f81e01b1..1b72e5ce51378 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -26,6 +26,8 @@
 from tvm.contrib.nvcc import have_fp16
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("graph", "vm")
+
 
 def sigmoid(x):
     one = np.ones_like(x)
@@ -286,7 +288,7 @@ def test_log_softmax():
 
 
 @tvm.testing.uses_gpu
-def test_concatenate():
+def test_concatenate(executor_kind):
     for dtype in ["float16", "float32"]:
         n, t, d = te.size_var("n"), te.size_var("t"), 100
         x = relay.var("x", shape=(n, t, d))
@@ -336,17 +338,13 @@ def test_concatenate():
                 and not have_fp16(tvm.cuda(0).compute_version)
             ):
                 continue
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                x_data, y_data, t_data
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 x_data, y_data, t_data
             )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
 
 
-def test_dropout():
+def test_dropout(executor_kind):
     for dtype in ["float16", "float32"]:
         n, t, d = te.size_var("n"), te.size_var("t"), te.size_var("d")
         input_ty = relay.TensorType((n, t, d), dtype)
@@ -361,9 +359,8 @@ def test_dropout():
     y = relay.nn.dropout(x, rate=0.5)
     func = relay.Function([], y)
     for target, dev in tvm.testing.enabled_targets():
-        for backend in ["debug", "graph"]:
-            op_res = relay.create_executor("debug", device=dev, target=target).evaluate(func)()
-            tvm.testing.assert_allclose(op_res.numpy(), in_np, rtol=0.01)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)()
+        tvm.testing.assert_allclose(op_res.numpy(), in_np, rtol=0.01)
 
 
 def test_batch_norm():
@@ -490,7 +487,7 @@ def test_matmul_type_check():
 
 
 @tvm.testing.uses_gpu
-def test_matmul():
+def test_matmul(executor_kind):
     for dtype in ["float16", "float32"]:
         # Matmul accuracy for float16 is poor
         if dtype == "float16":
@@ -529,14 +526,10 @@ def test_matmul():
         ref_res = np.dot(x_data.transpose(), w_data)
 
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 x_data, w_data
             )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
-                x_data, w_data
-            )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @pytest.mark.xfail
@@ -552,7 +545,7 @@ def test_dense_type_check():
 
 
 @tvm.testing.uses_gpu
-def test_dense():
+def test_dense(executor_kind):
     for dtype in ["float16", "float32"]:
         # Dense accuracy for float16 is poor
         if dtype == "float16":
@@ -591,14 +584,10 @@ def test_dense():
         ref_res = np.dot(x_data, w_data.T)
 
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                x_data, w_data
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 x_data, w_data
             )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 8ee5adbb318d6..7e0b8ad89f644 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -27,9 +27,11 @@
 from tvm.relay import transform
 from tvm.relay.testing import run_infer_type
 
+executor_kind = tvm.testing.parameter("graph", "vm")
+
 
 @tvm.testing.uses_gpu
-def test_checkpoint():
+def test_checkpoint(executor_kind):
     dtype = "float32"
     xs = [relay.var("x{}".format(i), dtype) for i in range(4)]
     f = relay.multiply(relay.add(xs[0], xs[1]), relay.add(xs[2], xs[3]))
@@ -41,12 +43,11 @@ def test_checkpoint():
 
     inputs = [np.random.uniform() for _ in range(len(xs))]
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            f_res = relay.create_executor(kind, device=dev, target=target).evaluate(f)(*inputs)
-            f_checkpoint_res = relay.create_executor(kind, device=dev, target=target).evaluate(
-                f_checkpoint
-            )(*inputs)
-            tvm.testing.assert_allclose(f_res.numpy(), f_checkpoint_res.numpy(), 0, 0)
+        f_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(*inputs)
+        f_checkpoint_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
+            f_checkpoint
+        )(*inputs)
+        tvm.testing.assert_allclose(f_res.numpy(), f_checkpoint_res.numpy(), 0, 0)
 
 
 def test_checkpoint_alpha_equal():
@@ -171,7 +172,7 @@ def test_checkpoint_alpha_equal_tuple():
 
 
 @tvm.testing.uses_gpu
-def test_collapse_sum_like():
+def test_collapse_sum_like(executor_kind):
     shape = (3, 4, 5, 6)
     shape_like = (4, 5, 6)
     dtype = "float32"
@@ -186,13 +187,14 @@ def test_collapse_sum_like():
     y = np.random.uniform(size=shape_like).astype(dtype)
     ref_res = np.sum(x, 0)
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x, y)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x, y
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_collapse_sum_to():
+def test_collapse_sum_to(executor_kind):
     shape = (3, 4, 5, 6)
     shape_to = (4, 5, 6)
     dtype = "float32"
@@ -205,13 +207,12 @@ def test_collapse_sum_to():
     x = np.random.uniform(size=shape).astype(dtype)
     ref_res = np.sum(x, 0)
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x)
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_broadcast_to():
+def test_broadcast_to(executor_kind):
     shape = (4, 1, 6)
     shape_like = (3, 4, 5, 6)
     dtype = "float32"
@@ -224,13 +225,12 @@ def test_broadcast_to():
     x = np.random.uniform(size=shape).astype(dtype)
     ref_res = np.broadcast_to(x, shape_like)
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x)
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_broadcast_to_const_shape_int64():
+def test_broadcast_to_const_shape_int64(executor_kind):
     shape_like = relay.const(np.array([1, 5]), dtype="int64")
     x = relay.var("x", shape=(1,), dtype="int64")
     z = relay.broadcast_to(x, shape=shape_like)
@@ -241,13 +241,12 @@ def test_broadcast_to_const_shape_int64():
     x = np.random.randint(10, size=(1,), dtype="int64")
     ref_res = np.broadcast_to(x, (5,))
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(f)(x)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x)
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res)
 
 
 @tvm.testing.uses_gpu
-def test_broadcast_to_like():
+def test_broadcast_to_like(executor_kind):
     shape = (4, 1, 6)
     shape_like = (3, 4, 5, 6)
     dtype = "float32"
@@ -264,9 +263,10 @@ def test_broadcast_to_like():
     ref_res = np.broadcast_to(x, shape_like)
 
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x, y)
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x, y
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 def np_slice_like(np_data, np_shape_like, axis=None):
@@ -288,7 +288,7 @@ def np_slice_like(np_data, np_shape_like, axis=None):
     return np_result
 
 
-def verify_slice_like(data, slice_like, axes, output, dtype="float32"):
+def verify_slice_like(executor_kind, data, slice_like, axes, output, dtype="float32"):
     x = relay.var("data", relay.TensorType(data, dtype))
     y = relay.var("slice_like", relay.TensorType(slice_like, dtype))
     z = relay.slice_like(x, y, axes)
@@ -308,31 +308,46 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"):
     ref_res = np_slice_like(x_data, y_data, axes)
 
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                x_data, y_data
-            )
-            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x_data, y_data
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_slice_like():
+def test_slice_like(executor_kind):
     d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4")
-    verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
-    verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3))
-    verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1, 2), output=(d2, d2, d3))
-    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
-    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5))
-    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3))
-    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3))
     verify_slice_like(
-        data=(1, 3, 224, 224), slice_like=(1, 3, 112, 112), axes=(2, 3), output=(1, 3, 112, 112)
+        executor_kind, data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)
+    )
+    verify_slice_like(
+        executor_kind, data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3)
+    )
+    verify_slice_like(
+        executor_kind, data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1, 2), output=(d2, d2, d3)
+    )
+    verify_slice_like(
+        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)
+    )
+    verify_slice_like(executor_kind, data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5))
+    verify_slice_like(
+        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3)
+    )
+    verify_slice_like(
+        executor_kind, data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3)
+    )
+    verify_slice_like(
+        executor_kind,
+        data=(1, 3, 224, 224),
+        slice_like=(1, 3, 112, 112),
+        axes=(2, 3),
+        output=(1, 3, 112, 112),
     )
 
 
 @tvm.testing.uses_gpu
-def test_reverse_reshape():
-    def verify_reverse_reshape(shape, newshape, oshape):
+def test_reverse_reshape(executor_kind):
+    def verify_reverse_reshape(executor_kind, shape, newshape, oshape):
         x = relay.var("x", relay.TensorType(shape, "float32"))
         z = relay.reverse_reshape(x, newshape=newshape)
         zz = run_infer_type(z)
@@ -343,21 +358,20 @@ def verify_reverse_reshape(shape, newshape, oshape):
         x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
         ref_res = np.reshape(x_data, oshape)
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
-    verify_reverse_reshape((2, 3, 4), (4, 0, 2), (4, 3, 2))
-    verify_reverse_reshape((2, 3, 4), (2, 0, 0), (2, 3, 4))
-    verify_reverse_reshape((2, 3, 4), (0, -1), (3, 8))
-    verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4))
-    verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12))
+    verify_reverse_reshape(executor_kind, (2, 3, 4), (4, 0, 2), (4, 3, 2))
+    verify_reverse_reshape(executor_kind, (2, 3, 4), (2, 0, 0), (2, 3, 4))
+    verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -1), (3, 8))
+    verify_reverse_reshape(executor_kind, (2, 3, 4), (-1, 0), (6, 4))
+    verify_reverse_reshape(executor_kind, (2, 3, 4), (0, -3), (2, 12))
 
 
 def verify_batch_matmul_with_inputs(
-    x, y, x_np, y_np, out_shape, dtype="float32", trans_x=False, trans_y=True
+    executor_kind, x, y, x_np, y_np, out_shape, dtype="float32", trans_x=False, trans_y=True
 ):
     z = relay.nn.batch_matmul(x, y, transpose_a=trans_x, transpose_b=trans_y)
     zz = run_infer_type(z)
@@ -368,26 +382,29 @@ def verify_batch_matmul_with_inputs(
     z_np = tvm.topi.testing.batch_matmul(x_np, y_np, trans_x=trans_x, trans_y=trans_y)
 
     for target, dev in tvm.testing.enabled_targets():
-        for kind in ["graph", "debug"]:
-            if len(input_vars) == 2:
-                z = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_np, y_np
-                )
-            else:
-                z = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_np)
-            tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5, atol=1e-5)
+        if len(input_vars) == 2:
+            z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_np, y_np
+            )
+        else:
+            z = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_np)
+        tvm.testing.assert_allclose(z.numpy(), z_np, rtol=1e-5, atol=1e-5)
 
 
-def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32", trans_x=False, trans_y=True):
+def verify_batch_matmul(
+    executor_kind, x_shape, y_shape, out_shape, dtype="float32", trans_x=False, trans_y=True
+):
     x = relay.var("x", relay.TensorType(x_shape, dtype))
     y = relay.var("y", relay.TensorType(y_shape, dtype))
     x_np = np.random.uniform(size=x_shape).astype(dtype)
     y_np = np.random.uniform(size=y_shape).astype(dtype)
-    verify_batch_matmul_with_inputs(x, y, x_np, y_np, out_shape, dtype, trans_x, trans_y)
+    verify_batch_matmul_with_inputs(
+        executor_kind, x, y, x_np, y_np, out_shape, dtype, trans_x, trans_y
+    )
 
 
 @tvm.testing.uses_gpu
-def test_batch_matmul():
+def test_batch_matmul(executor_kind):
     b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k")
     x = relay.var("x", relay.TensorType((b, m, k), "float32"))
     y = relay.var("y", relay.TensorType((b, n, k), "float32"))
@@ -395,17 +412,31 @@ def test_batch_matmul():
     zz = run_infer_type(z)
     assert zz.checked_type == relay.TensorType((b, m, n), "float32")
 
-    verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True)
-    verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True)
-    verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True)
-    verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True)
-    verify_batch_matmul((1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True)
-    verify_batch_matmul((5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False)
-    verify_batch_matmul((5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False)
+    verify_batch_matmul(
+        executor_kind, (1, 16, 32), (1, 16, 32), (1, 16, 16), trans_x=False, trans_y=True
+    )
+    verify_batch_matmul(
+        executor_kind, (5, 16, 32), (5, 16, 32), (5, 16, 16), trans_x=False, trans_y=True
+    )
+    verify_batch_matmul(
+        executor_kind, (5, 16, 32), (5, 20, 32), (5, 16, 20), trans_x=False, trans_y=True
+    )
+    verify_batch_matmul(
+        executor_kind, (30, 16, 32), (30, 20, 32), (30, 16, 20), trans_x=False, trans_y=True
+    )
+    verify_batch_matmul(
+        executor_kind, (1, 32, 16), (1, 16, 32), (1, 16, 16), trans_x=True, trans_y=True
+    )
+    verify_batch_matmul(
+        executor_kind, (5, 16, 32), (5, 32, 16), (5, 16, 16), trans_x=False, trans_y=False
+    )
+    verify_batch_matmul(
+        executor_kind, (5, 32, 16), (5, 32, 20), (5, 16, 20), trans_x=True, trans_y=False
+    )
 
     x_np = np.random.randn(10, 27, 64).astype("float32")
     x = relay.var("x", shape=x_np.shape)
-    verify_batch_matmul_with_inputs(x, x, x_np, x_np, (10, 27, 27))
+    verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27))
 
 
 @pytest.mark.skip("Requires cascadelake")
@@ -492,13 +523,13 @@ def test_shape_of():
     for target, dev in tvm.testing.enabled_targets():
         # Because using graph executor, this op will be optimized after
         # constant folding pass, here we only test with interpreter
-        for kind in ["debug"]:
+        for kind in ["vm"]:
             op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(x_data)
             tvm.testing.assert_allclose(op_res.numpy(), np.array(shape).astype("int32"))
 
 
 @tvm.testing.uses_gpu
-def test_ndarray_size():
+def test_ndarray_size(executor_kind):
     def verify_ndarray_size(shape):
         x = relay.var("x", shape=shape)
         func = relay.Function([x], relay.op.ndarray_size(x))
@@ -507,11 +538,10 @@ def verify_ndarray_size(shape):
         x_data = np.random.uniform(size=shape).astype("float32")
         ref_res = np.size(x_data)
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res)
 
     verify_ndarray_size((2, 3, 5))
     verify_ndarray_size((2, 3, 5, 7))
@@ -573,7 +603,7 @@ def test_adaptive_pool():
 
 
 @tvm.testing.uses_gpu
-def test_sequence_mask():
+def test_sequence_mask(executor_kind):
     def _verify(data_shape, mask_value, axis, dtype, itype):
         max_length = data_shape[axis]
         nbatch = data_shape[1 - axis]
@@ -588,11 +618,10 @@ def _verify(data_shape, mask_value, axis, dtype, itype):
         gt_out_np = tvm.topi.testing.sequence_mask(data_np, valid_length_np, mask_value, axis)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    data_np, valid_length_np
-                )
-                tvm.testing.assert_allclose(out_relay.numpy(), gt_out_np)
+            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
+                func
+            )(data_np, valid_length_np)
+            tvm.testing.assert_allclose(out_relay.numpy(), gt_out_np)
 
     _verify((5, 10), 0.0, 1, "float32", "int32")
     _verify((2, 3, 5, 3), 0.0, 0, "float32", "int64")
@@ -600,7 +629,7 @@ def _verify(data_shape, mask_value, axis, dtype, itype):
 
 
 @tvm.testing.uses_gpu
-def test_one_hot():
+def test_one_hot(executor_kind):
     def _get_oshape(indices_shape, depth, axis):
         oshape = []
         true_axis = len(indices_shape) if axis == -1 else axis
@@ -629,11 +658,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
         out_np = tvm.topi.testing.one_hot(indices_np, on_value, off_value, depth, axis, dtype)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    indices_np
-                )
-                tvm.testing.assert_allclose(out_relay.numpy(), out_np)
+            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
+                func
+            )(indices_np)
+            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
 
     _verify((3,), 3, 1, 0, -1, "int32")
     _verify((3,), 3, 1.0, 0.0, -1, "float32")
@@ -644,7 +672,7 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype):
 
 
 @tvm.testing.uses_gpu
-def test_matrix_set_diag():
+def test_matrix_set_diag(executor_kind):
     def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
         input = relay.var("input", relay.TensorType(input_shape, dtype))
         diagonal = relay.var("diagonal", relay.TensorType(diagonal_shape, dtype))
@@ -660,11 +688,10 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
         out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    input_np, diagonal_np
-                )
-                tvm.testing.assert_allclose(out_relay.numpy(), out_np)
+            out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(
+                func
+            )(input_np, diagonal_np)
+            tvm.testing.assert_allclose(out_relay.numpy(), out_np)
 
     _verify((2, 2), (2,), "float32")
     _verify((4, 3, 3), (4, 3), "int32")
@@ -675,7 +702,7 @@ def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"):
 
 
 @tvm.testing.parametrize_targets
-def test_nll_loss(dev, target):
+def test_nll_loss(executor_kind, dev, target):
     def _get_oshape(target_shape, reduction):
         if reduction == "none":
             return target_shape
@@ -702,11 +729,10 @@ def _verify(prediction_shape, reduction="mean", ignore_index=-100, dtype="float3
             predictions_np, targets_np, weights_np, reduction, ignore_index
         )
 
-        for kind in ["graph", "debug"]:
-            out_relay = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                predictions_np, targets_np, weights_np
-            )
-            tvm.testing.assert_allclose(out_relay.numpy(), out_np, rtol=1e-6, atol=1e-6)
+        out_relay = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            predictions_np, targets_np, weights_np
+        )
+        tvm.testing.assert_allclose(out_relay.numpy(), out_np, rtol=1e-6, atol=1e-6)
 
     _verify((10, 5))
     _verify((10, 5, 2, 2))
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
index c644890bbcbeb..726ee578da85e 100644
--- a/tests/python/relay/test_op_level2.py
+++ b/tests/python/relay/test_op_level2.py
@@ -30,6 +30,8 @@
 from tvm.relay.testing import run_infer_type
 from tvm.topi.cuda.conv3d_winograd import _infer_tile_size
 
+executor_kind = tvm.testing.parameter("graph", "vm")
+
 
 @tvm.testing.uses_gpu
 def test_conv1d_infer_type():
@@ -1301,7 +1303,7 @@ def test_avg_pool2d_no_count_pad():
 
 
 @tvm.testing.uses_gpu
-def test_flatten_infer_type():
+def test_flatten_infer_type(executor_kind):
     d1, d2, d3, d4 = te.size_var("d1"), te.size_var("d2"), te.size_var("d3"), te.size_var("d4")
     x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32"))
     y = relay.nn.batch_flatten(x)
@@ -1330,10 +1332,10 @@ def test_flatten_infer_type():
     ref_res = x_data.flatten().reshape(o_shape)
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-        op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x_data
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
@@ -1438,7 +1440,7 @@ def _test_run(dtype):
 
 @tvm.testing.uses_gpu
 @pytest.mark.parametrize("dtype", ["float32", "float16"])
-def test_lrn(dtype):
+def test_lrn(executor_kind, dtype):
     n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
     x = relay.var("x", shape=(n, c, h, w), dtype=dtype)
     y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=0.00001, beta=0.75)
@@ -1461,14 +1463,14 @@ def test_lrn(dtype):
     ref_res = tvm.topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-        op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x_data
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_l2_normalize():
+def test_l2_normalize(executor_kind):
     n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
     x = relay.var("x", shape=(n, c, h, w))
     y = relay.nn.l2_normalize(x, eps=0.001, axis=[1])
@@ -1489,10 +1491,10 @@ def test_l2_normalize():
     ref_res = tvm.topi.testing.l2_normalize_python(x_data, eps, axis)
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-        op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x_data
+        )
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 def batch_flatten(data):
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index ef4b45ade9aa6..9d27839c4703b 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -30,7 +30,7 @@
 
 from utils import ref_funcs
 
-executor_kind = tvm.testing.parameter("graph", "debug")
+executor_kind = tvm.testing.parameter("graph", "vm")
 
 
 class TestZerosOnes:
@@ -644,7 +644,7 @@ def test_full_like_infer_type():
     assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
 
 
-def test_infer_type_leaky_relu(target, dev):
+def test_infer_type_leaky_relu(target, dev, executor_kind):
     n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w")
     x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
     y = relay.nn.leaky_relu(x, alpha=0.1)
@@ -663,10 +663,8 @@ def test_infer_type_leaky_relu(target, dev):
     x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
     ref_res = np.where(x_data > 0, x_data, x_data * 0.1)
 
-    op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data)
-    tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-    op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data)
-    tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+    op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(x_data)
+    tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 class TestInferTypePrelu:
@@ -684,7 +682,7 @@ class TestInferTypePrelu:
         ((1, 2, 2, 3), None, 3, (1, 2, 2, 3)),
     )
 
-    def test_infer_type_prelu(self, target, dev, data, alpha, axis, output, dtype):
+    def test_infer_type_prelu(self, target, dev, executor_kind, data, alpha, axis, output, dtype):
         x = relay.var("data", relay.TensorType(data, dtype))
         if alpha:
             y = relay.var("alpha", relay.TensorType(alpha, dtype))
@@ -712,14 +710,10 @@ def test_infer_type_prelu(self, target, dev, data, alpha, axis, output, dtype):
         else:
             ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data >= 0) * x_data
 
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-            x_data, a_data
-        )
-        tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-        op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
             x_data, a_data
         )
-        tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+        tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
 
 class TestArange:
@@ -1051,7 +1045,7 @@ class TestDynamicScatter:
         ((16, 16, 4, 5), (16, 16, 4, 5), 3),
     )
 
-    @pytest.mark.parametrize("executor_kind", ["vm", "debug"])
+    @pytest.mark.parametrize("executor_kind", ["vm"])
     def test_dynamic_scatter(self, target, dev, executor_kind, dshape, ishape, axis):
         d = relay.var("d", relay.TensorType([relay.Any() for i in range(len(dshape))], "float32"))
         i = relay.var("i", relay.TensorType([relay.Any() for i in range(len(ishape))], "int64"))
@@ -2033,31 +2027,30 @@ def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False):
         x_data = np.random.randint(50, size=n).astype(dtype)
 
         if is_dyn:
-            backends = ["vm", "debug"]
+            backend = "vm"
         else:
-            backends = ["graph", "debug"]
-
-        for kind in backends:
-            mod = tvm.ir.IRModule.from_expr(func)
-            tvm_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                x_data
-            )  # unique, indices, inverse_indices, num_unique, (counts)
-            np_res = calc_numpy_unique(
-                x_data, is_sorted
-            )  # unique, indices, inverse_indices, num_unique, counts
-            num_unique = np_res[3][0]
-
-            # num_unique
-            assert num_unique == tvm_res[3].numpy()[0]
-            # unique
-            tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5)
-            # indices
-            tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5)
-            # inverse_indices
-            tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5)
-            # counts
-            if return_counts:
-                tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5)
+            backend = "graph"
+
+        mod = tvm.ir.IRModule.from_expr(func)
+        tvm_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
+            x_data
+        )  # unique, indices, inverse_indices, num_unique, (counts)
+        np_res = calc_numpy_unique(
+            x_data, is_sorted
+        )  # unique, indices, inverse_indices, num_unique, counts
+        num_unique = np_res[3][0]
+
+        # num_unique
+        assert num_unique == tvm_res[3].numpy()[0]
+        # unique
+        tvm.testing.assert_allclose(tvm_res[0].numpy()[:num_unique], np_res[0], rtol=1e-5)
+        # indices
+        tvm.testing.assert_allclose(tvm_res[1].numpy()[:num_unique], np_res[1], rtol=1e-5)
+        # inverse_indices
+        tvm.testing.assert_allclose(tvm_res[2].numpy(), np_res[2], rtol=1e-5)
+        # counts
+        if return_counts:
+            tvm.testing.assert_allclose(tvm_res[4].numpy()[:num_unique], np_res[4], rtol=1e-5)
 
     for dtype in ["int32", "int64"]:
         for i in range(8):
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
index b9bbef951555b..e46832d570e90 100644
--- a/tests/python/relay/test_op_level4.py
+++ b/tests/python/relay/test_op_level4.py
@@ -26,7 +26,7 @@
 from tvm.relay import transform
 from tvm.relay.testing import run_infer_type
 
-executor_kind = tvm.testing.parameter("graph", "debug")
+executor_kind = tvm.testing.parameter("graph", "vm")
 
 
 @tvm.testing.uses_gpu
@@ -153,14 +153,13 @@ def test_binary_int_broadcast_2():
 
 
 @tvm.testing.uses_gpu
-def test_where():
+def test_where(executor_kind):
     def run(func, inputs, ref_res):
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    *inputs
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                *inputs
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     def verify(x_np, y_np, cond_np):
         ref_res = np.where(cond_np, x_np, y_np)
@@ -398,7 +397,7 @@ def get_test_case(shape, gt_func, test_argmin=False):
                 assert op_res.numpy().item() == ans
 
 
-def verify_mean_var_std(funcs, shape, axis, keepdims):
+def verify_mean_var_std(executor_kind, funcs, shape, axis, keepdims):
     test_func = funcs[0]
     ref_func = funcs[1]
     dtype = "float32"
@@ -411,27 +410,26 @@ def verify_mean_var_std(funcs, shape, axis, keepdims):
     ref_res = ref_func(x_data, axis=axis, dtype=dtype, keepdims=keepdims)
 
     for target, dev in tvm.testing.enabled_targets():
-        op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res1[0].numpy(), ref_mean, rtol=1e-5)
-        tvm.testing.assert_allclose(op_res1[1].numpy(), ref_res, rtol=1e-5)
-        op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(x_data)
-        tvm.testing.assert_allclose(op_res2[0].numpy(), ref_mean, rtol=1e-5)
-        tvm.testing.assert_allclose(op_res2[1].numpy(), ref_res, rtol=1e-5)
+        op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+            x_data
+        )
+        tvm.testing.assert_allclose(op_res[0].numpy(), ref_mean, rtol=1e-5)
+        tvm.testing.assert_allclose(op_res[1].numpy(), ref_res, rtol=1e-5)
 
 
 @tvm.testing.uses_gpu
-def test_mean_var_std():
+def test_mean_var_std(executor_kind):
     for func in [[relay.mean_variance, np.var], [relay.mean_std, np.std]]:
-        verify_mean_var_std(func, (2, 3, 4), 1, True)
-        verify_mean_var_std(func, (2, 3, 4), (1,), True)
-        verify_mean_var_std(func, (2, 3, 4), -1, True)
-        verify_mean_var_std(func, (2, 3, 4), (0, 1, 2), False)
-        verify_mean_var_std(func, (4, 4, 3), None, False)
-        verify_mean_var_std(func, (4, 4, 3), (0, 2), False)
-        verify_mean_var_std(func, (128, 24, 128), (0, 1), False)
-        verify_mean_var_std(func, (128, 24, 128), (0, 2), False)
-        verify_mean_var_std(func, (128, 24, 128), (0, 1), True)
-        verify_mean_var_std(func, (128, 24, 128), (0, 2), True)
+        verify_mean_var_std(executor_kind, func, (2, 3, 4), 1, True)
+        verify_mean_var_std(executor_kind, func, (2, 3, 4), (1,), True)
+        verify_mean_var_std(executor_kind, func, (2, 3, 4), -1, True)
+        verify_mean_var_std(executor_kind, func, (2, 3, 4), (0, 1, 2), False)
+        verify_mean_var_std(executor_kind, func, (4, 4, 3), None, False)
+        verify_mean_var_std(executor_kind, func, (4, 4, 3), (0, 2), False)
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), False)
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), False)
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 1), True)
+        verify_mean_var_std(executor_kind, func, (128, 24, 128), (0, 2), True)
 
 
 @tvm.testing.uses_gpu
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
index 10cd91415724f..af9c08409c010 100644
--- a/tests/python/relay/test_op_level5.py
+++ b/tests/python/relay/test_op_level5.py
@@ -29,7 +29,7 @@
 from tvm import relay, te
 from tvm.relay.testing import run_infer_type
 
-executor_kind = tvm.testing.parameter("graph", "debug")
+executor_kind = tvm.testing.parameter("graph", "vm")
 
 
 def test_resize1d_infer_type():
@@ -279,7 +279,7 @@ def test_crop_and_resize(self, target, dev, executor_kind, layout, interpolate_m
 
 
 @tvm.testing.uses_gpu
-def test_multibox_prior():
+def test_multibox_prior(executor_kind):
     def get_ref_result(
         dshape, sizes=(1.0,), ratios=(1.0,), steps=(-1.0, -1.0), offsets=(0.5, 0.5), clip=True
     ):
@@ -358,10 +358,10 @@ def verify_multibox_prior(
         func = relay.Function([x], z)
         func = run_infer_type(func)
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(data)
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(data)
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     sizes = (0.3, 1.5, 0.7)
     ratios = (1.3, 2.4)
@@ -415,7 +415,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
         func = relay.Function([x], z.astuple())
         func = run_infer_type(func)
         for target, dev in tvm.testing.enabled_targets():
-            out = relay.create_executor("debug", device=dev, target=target).evaluate(func)(np_data)
+            out = relay.create_executor("vm", device=dev, target=target).evaluate(func)(np_data)
 
             tvm.testing.assert_allclose(out[0].numpy(), np_out1, rtol=1e-3, atol=1e-04)
             tvm.testing.assert_allclose(out[1].numpy(), np_out2, rtol=1e-3, atol=1e-04)
@@ -428,7 +428,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index):
 
 
 @tvm.testing.uses_gpu
-def test_non_max_suppression():
+def test_non_max_suppression(executor_kind):
     def verify_nms(
         x0_data,
         x1_data,
@@ -486,22 +486,14 @@ def verify_nms(
         func_indices = relay.Function([x0, x1, x2, x3], z_indices)
         func_indices = run_infer_type(func_indices)
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                x0_data, x1_data, x2_data, x3_data
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 x0_data, x1_data, x2_data, x3_data
             )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-5)
-            op_indices_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(
-                func_indices
-            )(x0_data, x1_data, x2_data, x3_data)
-            tvm.testing.assert_allclose(op_indices_res1[0].numpy(), ref_indices_res, rtol=1e-5)
-            op_indices_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(
-                func_indices
-            )(x0_data, x1_data, x2_data, x3_data)
-            tvm.testing.assert_allclose(op_indices_res2[0].numpy(), ref_indices_res, rtol=1e-5)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            op_indices_res = relay.create_executor(
+                executor_kind, device=dev, target=target
+            ).evaluate(func_indices)(x0_data, x1_data, x2_data, x3_data)
+            tvm.testing.assert_allclose(op_indices_res[0].numpy(), ref_indices_res, rtol=1e-5)
 
     np_data = np.array(
         [
@@ -633,7 +625,7 @@ def verify_nms(
 
 
 @tvm.testing.uses_gpu
-def test_multibox_transform_loc():
+def test_multibox_transform_loc(executor_kind):
     def test_default_value():
         num_anchors = 3
         num_classes = 3
@@ -683,14 +675,10 @@ def test_default_value():
         func = relay.Function([cls_prob, loc_pred, anchors], nms)
         func = run_infer_type(func)
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                np_cls_prob, np_loc_preds, np_anchors
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), expected_np_out, rtol=1e-5)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 np_cls_prob, np_loc_preds, np_anchors
             )
-            tvm.testing.assert_allclose(op_res2.numpy(), expected_np_out, rtol=1e-5)
+            tvm.testing.assert_allclose(op_res.numpy(), expected_np_out, rtol=1e-5)
 
     def test_threshold():
         num_anchors = 5
@@ -727,7 +715,7 @@ def test_threshold():
 
 
 @tvm.testing.uses_gpu
-def test_roi_align():
+def test_roi_align(executor_kind):
     def verify_roi_align(
         data_shape,
         rois_shape,
@@ -778,14 +766,10 @@ def verify_roi_align(
             mode=mode,
         )
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 np_data, np_rois
             )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, atol=1e-6, rtol=1e-3)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
-                np_data, np_rois
-            )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, atol=1e-6, rtol=1e-3)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, atol=1e-6, rtol=1e-3)
 
     def verify_roi_align_nchw(
         data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode
@@ -848,7 +832,7 @@ def verify_roi_align_nhwc(
 
 
 @tvm.testing.uses_gpu
-def test_roi_pool():
+def test_roi_pool(executor_kind):
     def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale):
         data = relay.var("data", relay.ty.TensorType(data_shape, "float32"))
         rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32"))
@@ -875,21 +859,17 @@ def verify_roi_pool(data_shape, rois_shape, pooled_size, spatial_scale):
             np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale
         )
         for target, dev in tvm.testing.enabled_targets():
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
-                np_data, np_rois
-            )
-            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-4)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 np_data, np_rois
             )
-            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=1e-4)
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
 
     verify_roi_pool((1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0)
     verify_roi_pool((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5)
 
 
 @tvm.testing.uses_gpu
-def test_proposal():
+def test_proposal(executor_kind):
     def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs):
         cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32"))
         bbox_pred = relay.var("bbox_pred", relay.ty.TensorType(np_bbox_pred.shape, "float32"))
@@ -905,14 +885,10 @@ def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs):
                 print("Skip test because %s is not enabled." % target)
                 continue
             dev = tvm.device(target, 0)
-            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
                 np_cls_prob, np_bbox_pred, np_im_info
             )
-            tvm.testing.assert_allclose(op_res1.numpy(), np_out, rtol=1e-4)
-            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
-                np_cls_prob, np_bbox_pred, np_im_info
-            )
-            tvm.testing.assert_allclose(op_res2.numpy(), np_out, rtol=1e-4)
+            tvm.testing.assert_allclose(op_res.numpy(), np_out, rtol=1e-4)
 
     attrs = {
         "scales": (0.5,),
@@ -986,7 +962,7 @@ def verify_yolo_reorg(shape, stride, out_shape):
 
 
 @tvm.testing.uses_gpu
-def test_yolo_reorg():
+def test_yolo_reorg(executor_kind):
     def verify_yolo_reorg(shape, stride):
         x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
         ref_res = tvm.topi.testing.reorg_python(x_data, stride)
@@ -1000,11 +976,10 @@ def verify_yolo_reorg(shape, stride):
         func = relay.Function([x], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     verify_yolo_reorg((1, 100, 20, 20), 10)
     verify_yolo_reorg((1, 4, 6, 6), 2)
@@ -1155,7 +1130,7 @@ def test_run(
 
 
 @tvm.testing.uses_gpu
-def test_depth_to_space():
+def test_depth_to_space(executor_kind):
     def verify_depth_to_space(dshape, block_size, layout, mode):
         if layout == "NHWC":
             out_shape = [
@@ -1188,11 +1163,10 @@ def verify_depth_to_space(dshape, block_size, layout, mode):
         func = relay.Function([x], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
 
     for layout in ["NHWC", "NCHW"]:
         for mode in ["DCR", "CDR"]:
@@ -1200,7 +1174,7 @@ def verify_depth_to_space(dshape, block_size, layout, mode):
 
 
 @tvm.testing.uses_gpu
-def test_space_to_depth():
+def test_space_to_depth(executor_kind):
     def verify_space_to_depth(dshape, block_size, layout):
         if layout == "NHWC":
             out_shape = [
@@ -1233,11 +1207,10 @@ def verify_space_to_depth(dshape, block_size, layout):
         func = relay.Function([x], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
 
     for layout in ["NHWC", "NCHW"]:
         verify_space_to_depth((1, 4, 4, 4), 2, layout)
@@ -1369,7 +1342,7 @@ def test_dilation2d(
 
 
 @tvm.testing.uses_gpu
-def test_affine_grid():
+def test_affine_grid(executor_kind):
     def verify_affine_grid(num_batch, target_shape):
         dtype = "float32"
         data_shape = (num_batch, 2, 3)
@@ -1385,18 +1358,17 @@ def verify_affine_grid(num_batch, target_shape):
         ref_res = tvm.topi.testing.affine_grid_python(data_np, target_shape)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res1 = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    data_np
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                data_np
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5)
 
     verify_affine_grid(1, (16, 32))
     verify_affine_grid(4, (16, 32))
 
 
 @tvm.testing.uses_gpu
-def test_grid_sample():
+def test_grid_sample(executor_kind):
     def verify_grid_sample(
         data_shape, grid_shape, method="bilinear", padding_mode="zeros", align_corners=True
     ):
@@ -1436,11 +1408,10 @@ def verify_grid_sample(
         )
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res1 = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    data_np, grid_np
-                )
-                tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=1e-5, atol=1e-5)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                data_np, grid_np
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5, atol=1e-5)
 
     methods = ["nearest", "bilinear", "bicubic"]
     padding_modes = ["zeros", "border", "reflection"]
@@ -1462,7 +1433,7 @@ def verify_grid_sample(
 
 
 @tvm.testing.uses_gpu
-def test_space_to_batch_nd():
+def test_space_to_batch_nd(executor_kind):
     def verify_space_to_batch_nd(dshape, block_shape, paddings):
         x_data = np.random.uniform(size=dshape).astype("float32")
         pad_before, pad_after = map(list, zip(*paddings))
@@ -1479,18 +1450,17 @@ def verify_space_to_batch_nd(dshape, block_shape, paddings):
         func = relay.Function([x], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
 
     verify_space_to_batch_nd([3, 3, 2, 1], [3], [[0, 0]])
     verify_space_to_batch_nd([2, 2, 4, 1], [2, 2], [[0, 0], [2, 0]])
 
 
 @tvm.testing.uses_gpu
-def test_batch_to_space_nd():
+def test_batch_to_space_nd(executor_kind):
     def verify_batch_to_space_nd(dshape, block_shape, crops):
         x_data = np.random.uniform(size=dshape).astype("float32")
         crop_begin_list, crop_end_list = map(list, zip(*crops))
@@ -1507,18 +1477,17 @@ def verify_batch_to_space_nd(dshape, block_shape, crops):
         func = relay.Function([x], z)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-4)
 
     verify_batch_to_space_nd([4, 1, 1, 3], [2, 2], [[0, 0], [0, 0]])
     verify_batch_to_space_nd([8, 1, 3, 1], [2, 2], [[0, 0], [2, 0]])
 
 
 @tvm.testing.uses_gpu
-def test_all_class_non_max_suppression():
+def test_all_class_non_max_suppression(executor_kind):
     def verify_all_class_non_max_suppression(
         boxes_np,
         scores_np,
@@ -1542,12 +1511,11 @@ def verify_all_class_non_max_suppression(
         func = run_infer_type(func)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                selected_indices, num_detections = relay.create_executor(
-                    kind, device=dev, target=target
-                ).evaluate(func)(boxes_np, scores_np)
-                tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]]
-                np.testing.assert_equal(tvm_res, expected_indices)
+            selected_indices, num_detections = relay.create_executor(
+                executor_kind, device=dev, target=target
+            ).evaluate(func)(boxes_np, scores_np)
+            tvm_res = selected_indices.numpy()[: num_detections.numpy()[0]]
+            np.testing.assert_equal(tvm_res, expected_indices)
 
     boxes = np.array(
         [
diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py
index 48c58dc2dc33a..78db5b87385d5 100644
--- a/tests/python/relay/test_op_level6.py
+++ b/tests/python/relay/test_op_level6.py
@@ -23,6 +23,8 @@
 from tvm.topi.testing import searchsorted_ref
 import tvm.testing
 
+executor_kind = tvm.testing.parameter("graph", "vm")
+
 
 @tvm.testing.uses_gpu
 def test_sort():
@@ -40,16 +42,15 @@ def verify_sort(shape, axis, is_ascend, is_dyn=False, in_dtype="float32"):
             ref_res = -np.sort(-x_data, axis=axis)
 
         if is_dyn:
-            backends = ["vm", "debug"]
+            backend = "vm"
         else:
-            backends = ["graph", "debug"]
+            backend = "graph"
         for target, dev in tvm.testing.enabled_targets():
-            for kind in backends:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5)
 
     for is_dyn in [False, True]:
         verify_sort((2, 3, 4), axis=0, is_ascend=False, is_dyn=is_dyn)
@@ -76,16 +77,15 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False, in_dtype="float3
             ref_res = np.argsort(-x_data, axis=axis, kind="stable")
 
         if is_dyn:
-            backends = ["vm", "debug"]
+            backend = "vm"
         else:
-            backends = ["graph", "debug"]
+            backend = "graph"
         for target, dev in tvm.testing.enabled_targets():
-            for kind in backends:
-                mod = tvm.ir.IRModule.from_expr(func)
-                op_res = relay.create_executor(kind, mod=mod, device=dev, target=target).evaluate()(
-                    x_data
-                )
-                tvm.testing.assert_allclose(op_res.numpy(), ref_res.astype(dtype), rtol=1e-5)
+            mod = tvm.ir.IRModule.from_expr(func)
+            op_res = relay.create_executor(backend, mod=mod, device=dev, target=target).evaluate()(
+                x_data
+            )
+            tvm.testing.assert_allclose(op_res.numpy(), ref_res.astype(dtype), rtol=1e-5)
 
     for is_dyn in [False, True]:
         for dtype in ["int32", "int64", "float32", "float64"]:
@@ -102,7 +102,7 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False, in_dtype="float3
 
 
 @tvm.testing.uses_gpu
-def test_topk():
+def test_topk(executor_kind):
     def verify_topk(k, axis, ret_type, is_ascend, dtype, in_dtype="float32"):
         shape = (20, 100)
         x = relay.var("x", relay.TensorType(shape, in_dtype))
@@ -129,17 +129,16 @@ def verify_topk(k, axis, ret_type, is_ascend, dtype, in_dtype="float32"):
         np_indices = np_indices.astype(dtype)
 
         for target, dev in tvm.testing.enabled_targets():
-            for kind in ["graph", "debug"]:
-                op_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(
-                    np_data
-                )
-                if ret_type == "both":
-                    tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
-                    tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
-                elif ret_type == "values":
-                    tvm.testing.assert_allclose(op_res.numpy(), np_values)
-                else:
-                    tvm.testing.assert_allclose(op_res.numpy(), np_indices)
+            op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(func)(
+                np_data
+            )
+            if ret_type == "both":
+                tvm.testing.assert_allclose(op_res[0].numpy(), np_values)
+                tvm.testing.assert_allclose(op_res[1].numpy(), np_indices)
+            elif ret_type == "values":
+                tvm.testing.assert_allclose(op_res.numpy(), np_values)
+            else:
+                tvm.testing.assert_allclose(op_res.numpy(), np_indices)
 
     np.random.seed(0)
     for k in [0, 1, 5]:

From 6247bf48aaa59be9549dd8c342702c6005f16c5f Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Mon, 23 May 2022 11:59:02 +0100
Subject: [PATCH 58/59] [CMSIS-NN] Aligned buffer sizes for Conv2D post
 CMSIS-NN SHA update (#11359)

---
 .../backend/contrib/cmsisnn/buffer_size.cc    | 18 +++++---
 .../backend/contrib/cmsisnn/buffer_size.h     |  3 +-
 .../backend/contrib/cmsisnn/relay_to_tir.cc   |  6 +--
 .../contrib/cmsisnn/buffer_size_test.cc       | 41 ++++++++++---------
 tests/python/relay/aot/test_crt_aot.py        |  2 +-
 5 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
index 2502a09e75d67..b6b98c0fc34f0 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc
@@ -29,24 +29,30 @@ namespace cmsisnn {
 
 int Conv2dBufferSize(CMSISNNFlags flags, int32_t padding_w, int32_t padding_h, int32_t input_n,
                      int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
-                     int32_t stride_w, int32_t stride_h, int32_t filter_w, int32_t filter_h) {
+                     int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                     int32_t filter_w, int32_t filter_h) {
   bool is1x1 = (padding_w == 0) && (padding_h == 0) && (input_c % 4 == 0) && (stride_w == 1) &&
-               (stride_h == 1) && (filter_w == 1) && (filter_h == 1);
-  bool is1xN =
-      (output_h == 1) && (input_h == 1) && (filter_h == 1) && (output_w % 4 == 0) && (input_n == 1);
+               (stride_h == 1) && (filter_w == 1) && (filter_h == 1) && (dilation_w == 1) &&
+               (dilation_h == 1);
+  bool is1xN = (output_h == 1) && (input_h == 1) && (filter_h == 1) && (output_w % 4 == 0) &&
+               (input_n == 1) && (dilation_w == 1) && (dilation_h == 1);
 
   if (is1x1) {
     return 0;
   }
 
   if (is1xN) {
-    if (flags.dsp && !flags.mve) {
+    if (!flags.mve) {
       return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
     }
     return 0;
   }
 
-  if (flags.dsp) {
+  if (flags.mve) {
+    int32_t col_length = input_c * filter_w * filter_h;
+    col_length = (col_length + 7) / 8;
+    return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
+  } else {
     return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
   }
   return 0;
diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h
index dec3c3eafc485..e89763fd5a2d4 100644
--- a/src/relay/backend/contrib/cmsisnn/buffer_size.h
+++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h
@@ -56,7 +56,8 @@ namespace cmsisnn {
  */
 int Conv2dBufferSize(CMSISNNFlags flags, int32_t padding_w, int32_t padding_h, int32_t input_n,
                      int32_t input_h, int32_t input_c, int32_t output_h, int32_t output_w,
-                     int32_t stride_w, int32_t stride_h, int32_t filter_w, int32_t filter_h);
+                     int32_t stride_w, int32_t stride_h, int32_t dilation_w, int32_t dilation_h,
+                     int32_t filter_w, int32_t filter_h);
 
 /*!
  * \brief Calculates the appropriate buffer size for CMSIS-NN Depthwise Convolutions
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 210175817f9c1..dc5537ee905d8 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -238,9 +238,9 @@ class RelayToTIRVisitor : public MixedModeMutator {
       context_buffer_size =
           DepthwiseConv2dBufferSize(flags, input_n, input_c, output_c, filter_w, filter_h);
     } else {
-      context_buffer_size =
-          Conv2dBufferSize(flags, padding_w, padding_h, input_n, input_h, input_c, output_h,
-                           output_w, stride_w, stride_h, filter_w, filter_h);
+      context_buffer_size = Conv2dBufferSize(flags, padding_w, padding_h, input_n, input_h, input_c,
+                                             output_h, output_w, stride_w, stride_h, dilation_w,
+                                             dilation_h, filter_w, filter_h);
     }
 
     if (context_buffer_size) {
diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
index 7b8047a3b2941..b7458858d4aba 100644
--- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
+++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc
@@ -44,7 +44,7 @@ class CMSISNNCalculatedBufferSize : public testing::TestWithParam<std::array<int
 TEST(CMSISNNConv2dBufferSize, Conv1x1) {
   int32_t any = fake_parameters(gen);
   auto conv2d_1x1 = [=](CMSISNNFlags flags, int32_t input_c) {
-    return Conv2dBufferSize(flags, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1);
+    return Conv2dBufferSize(flags, 0, 0, any, any, input_c, any, any, 1, 1, 1, 1, 1, 1);
   };
 
   ASSERT_EQ(conv2d_1x1(kNoExt, 4), 0);
@@ -74,15 +74,15 @@ TEST(CMSISNNConv2dBufferSize, Conv1xN) {
   int32_t calculated_buffer = (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
 
   auto conv2d_1xn = [=](CMSISNNFlags flags, int32_t output_w) {
-    return Conv2dBufferSize(flags, any, any, 1, 1, input_c, 1, output_w, any, any, filter_w,
+    return Conv2dBufferSize(flags, any, any, 1, 1, input_c, 1, output_w, any, any, 1, 1, filter_w,
                             filter_h);
   };
 
-  ASSERT_EQ(conv2d_1xn(kNoExt, 4), 0);
-  ASSERT_EQ(conv2d_1xn(kNoExt, 8), 0);
-  ASSERT_EQ(conv2d_1xn(kNoExt, 12), 0);
-  ASSERT_EQ(conv2d_1xn(kNoExt, 16), 0);
-  ASSERT_EQ(conv2d_1xn(kNoExt, 32), 0);
+  ASSERT_EQ(conv2d_1xn(kNoExt, 4), calculated_buffer);
+  ASSERT_EQ(conv2d_1xn(kNoExt, 8), calculated_buffer);
+  ASSERT_EQ(conv2d_1xn(kNoExt, 12), calculated_buffer);
+  ASSERT_EQ(conv2d_1xn(kNoExt, 16), calculated_buffer);
+  ASSERT_EQ(conv2d_1xn(kNoExt, 32), calculated_buffer);
 
   ASSERT_EQ(conv2d_1xn(kHasDSP, 4), calculated_buffer);
   ASSERT_EQ(conv2d_1xn(kHasDSP, 8), calculated_buffer);
@@ -104,17 +104,20 @@ TEST(CMSISNNConv2dBufferSize, Default) {
   int32_t filter_w = fake_parameters(gen);
   int32_t filter_h = fake_parameters(gen);
   int32_t calculated_buffer = (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+  int32_t col_length = input_c * filter_w * filter_h;
+  col_length = (col_length + 7) / 8;
+  int32_t calculated_buffer_mve = 4 * col_length * 8 * (int32_t)sizeof(int8_t);
 
   auto conv2d = [=](CMSISNNFlags flags, int32_t output_w) {
-    return Conv2dBufferSize(flags, any, any, 1, 1, input_c, 1, output_w, any, any, filter_w,
-                            filter_h);
+    return Conv2dBufferSize(flags, any, any, 1, 1, input_c, 1, output_w, any, any, any, any,
+                            filter_w, filter_h);
   };
 
-  ASSERT_EQ(conv2d(kNoExt, 4), 0);
-  ASSERT_EQ(conv2d(kNoExt, 8), 0);
-  ASSERT_EQ(conv2d(kNoExt, 12), 0);
-  ASSERT_EQ(conv2d(kNoExt, 16), 0);
-  ASSERT_EQ(conv2d(kNoExt, 32), 0);
+  ASSERT_EQ(conv2d(kNoExt, 4), calculated_buffer);
+  ASSERT_EQ(conv2d(kNoExt, 8), calculated_buffer);
+  ASSERT_EQ(conv2d(kNoExt, 12), calculated_buffer);
+  ASSERT_EQ(conv2d(kNoExt, 16), calculated_buffer);
+  ASSERT_EQ(conv2d(kNoExt, 32), calculated_buffer);
 
   ASSERT_EQ(conv2d(kHasDSP, 4), calculated_buffer);
   ASSERT_EQ(conv2d(kHasDSP, 8), calculated_buffer);
@@ -122,11 +125,11 @@ TEST(CMSISNNConv2dBufferSize, Default) {
   ASSERT_EQ(conv2d(kHasDSP, 16), calculated_buffer);
   ASSERT_EQ(conv2d(kHasDSP, 32), calculated_buffer);
 
-  ASSERT_EQ(conv2d(kHasMVE, 4), calculated_buffer);
-  ASSERT_EQ(conv2d(kHasMVE, 8), calculated_buffer);
-  ASSERT_EQ(conv2d(kHasMVE, 12), calculated_buffer);
-  ASSERT_EQ(conv2d(kHasMVE, 16), calculated_buffer);
-  ASSERT_EQ(conv2d(kHasMVE, 32), calculated_buffer);
+  ASSERT_EQ(conv2d(kHasMVE, 4), calculated_buffer_mve);
+  ASSERT_EQ(conv2d(kHasMVE, 8), calculated_buffer_mve);
+  ASSERT_EQ(conv2d(kHasMVE, 12), calculated_buffer_mve);
+  ASSERT_EQ(conv2d(kHasMVE, 16), calculated_buffer_mve);
+  ASSERT_EQ(conv2d(kHasMVE, 32), calculated_buffer_mve);
 }
 
 TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) {
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index d1d80d434b6a6..ffae70d0cf817 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -992,7 +992,7 @@ def test_workspace_calculation_cmsis_nn():
     ):
         lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params)
     mlf_memory_map = mlf._build_function_memory_map(lib.function_metadata)
-    assert mlf_memory_map["main"][0]["workspace_size_bytes"] == 9904
+    assert mlf_memory_map["main"][0]["workspace_size_bytes"] == 14384
 
 
 def test_aot_codegen_checks_returns():

From 4626a61fe2c9213e156316df300de5a3228b2d11 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <Hzfengsy@sjtu.edu.cn>
Date: Mon, 23 May 2022 23:00:52 +0800
Subject: [PATCH 59/59] [TVMScript] fix typo for block syntax (#11407)

---
 python/tvm/script/parser.py                   |  4 +-
 python/tvm/tir/schedule/schedule.py           |  2 +-
 ...est_tir_transform_compact_buffer_region.py | 62 +++++++++----------
 .../test_tir_transform_flatten_buffer.py      | 22 +++----
 .../unittest/test_tvmscript_complete.py       | 12 ++--
 .../unittest/test_tvmscript_roundtrip.py      |  2 +-
 6 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py
index daeb018ea9899..a376cb7eb08d4 100644
--- a/python/tvm/script/parser.py
+++ b/python/tvm/script/parser.py
@@ -786,9 +786,9 @@ def transform_With(self, node):
             withitem = (expr context_expr, expr? optional_vars)
         By now 2 patterns of With is supported:
             1. with scope handler with symbol def
-                with T.block(*axes)/T.allocate() as targets:
+                with T.allocate() as targets:
             2. with scope handler without symbol def
-                with T.let()/T.Assert()/T.attr()/T.realize()
+                with T.block(*axes)/T.let()/T.Assert()/T.attr()/T.realize()
         """
 
         if not isinstance(node.rhs, ast.Call):
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 8bfd9063158c0..6474ba0baa3dd 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -592,7 +592,7 @@ def before_split(a: T.handle, b: T.handle) -> None:
                 A = T.match_buffer(a, (128, 128))
                 B = T.match_buffer(b, (128, 128))
                 for i, j in T.grid(128, 128):
-                    with T.block("B") as [vi, vj]:
+                    with T.block("B"):
                         vi, vj = T.axis.remap("SS", [i, j])
                         B[vi, vj] = A[vi, vj] * 2.0
 
diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
index 8ad95bd4bc0cf..3e538e27a494f 100644
--- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py
+++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py
@@ -40,12 +40,12 @@ def elementwise_func(a: T.handle, c: T.handle) -> None:
             T.writes(C[i, 0:16])
             B = T.alloc_buffer((16, 16), "float32")
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[i, j])
                     B[i, j] = A[i, j] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[i, j])
                     T.writes(C[i, j])
                     C[i, j] = B[i, j] * 2.0
@@ -61,12 +61,12 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None:
             T.writes(C[i, 0:16])
             B = T.alloc_buffer((1, 16), "float32")
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[0, j])
                     B[0, j] = A[i, j] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[0, j])
                     T.writes(C[i, j])
                     C[i, j] = B[0, j] * 2.0
@@ -97,7 +97,7 @@ def param_buffer_access_func(a: T.handle, c: T.handle) -> None:
             T.reads(A[i, 0:16])
             T.writes(B[i, 0:16])
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[i, j])
                     B[i, j] = A[i, j] + 1.0
@@ -115,12 +115,12 @@ def shared_mem_func(a: T.handle, c: T.handle) -> None:
                     T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16])
                     B = T.alloc_buffer((16, 16), "float32", scope="shared")
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(A[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(B[i0 * 8 + i1 * 4 + i2, j])
                             B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(B[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(C[i0 * 8 + i1 * 4 + i2, j])
                             C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0
@@ -138,12 +138,12 @@ def compacted_shared_mem_func(a: T.handle, c: T.handle) -> None:
                     T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16])
                     B = T.alloc_buffer((8, 16), "float32", scope="shared")
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(A[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(B[i1 * 4 + i2, j])
                             B[i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(B[i1 * 4 + i2, j])
                             T.writes(C[i0 * 8 + i1 * 4 + i2, j])
                             C[i0 * 8 + i1 * 4 + i2, j] = B[i1 * 4 + i2, j] * 2.0
@@ -161,12 +161,12 @@ def warp_mem_func(a: T.handle, c: T.handle) -> None:
                     T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16])
                     B = T.alloc_buffer((16, 16), "float32", scope="warp")
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(A[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(B[i0 * 8 + i1 * 4 + i2, j])
                             B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(B[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(C[i0 * 8 + i1 * 4 + i2, j])
                             C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0
@@ -184,12 +184,12 @@ def compacted_warp_mem_func(a: T.handle, c: T.handle) -> None:
                     T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16])
                     B = T.alloc_buffer((4, 16), "float32", scope="warp")
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(A[i0 * 8 + i1 * 4 + i2, j])
                             T.writes(B[i2, j])
                             B[i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(B[i2, j])
                             T.writes(C[i0 * 8 + i1 * 4 + i2, j])
                             C[i0 * 8 + i1 * 4 + i2, j] = B[i2, j] * 2.0
@@ -205,12 +205,12 @@ def symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None:
             T.writes(C[i * 8 : i * 8 + 8])
             B = T.alloc_buffer((n * 8,), "float32")
             for j in range(0, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i * 8 + j])
                     T.writes(B[i * 8 + j])
                     B[i * 8 + j] = A[i * 8 + j] + 1.0
             for j in range(0, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[i * 8 + j])
                     T.writes(C[i * 8 + j])
                     C[i * 8 + j] = B[i * 8 + j] * 2.0
@@ -226,12 +226,12 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None:
             T.writes(C[i * 8 : i * 8 + 8])
             B = T.alloc_buffer((T.min(n, 1) * 8,), "float32")
             for j in range(0, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i * 8 + j])
                     T.writes(B[j])
                     B[j] = A[i * 8 + j] + 1.0
             for j in range(0, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[j])
                     T.writes(C[i * 8 + j])
                     C[i * 8 + j] = B[j] * 2.0
@@ -247,7 +247,7 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
             T.writes(C[0, 8])
             B = T.alloc_buffer((8, 8), "float32")
             for j in range(0, 4):
-                with T.block() as []:
+                with T.block():
                     D = T.alloc_buffer((8, 8), "float32")
                     T.reads(A[i, j])
                     T.writes(B[i, j])
@@ -256,12 +256,12 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
                     for k in range(2, 4):
                         B[i, j] = A[i, j] + D[k, j]
             for j in range(3, 5):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[i, j])
                     T.writes(C[i, j])
                     C[i, j] = B[i, j]
             for j in range(6, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[i, j])
                     T.writes(C[i, j])
                     C[i, j] = B[i, j]
@@ -277,7 +277,7 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
             T.writes(C[0, 8])
             B = T.alloc_buffer((1, 8), "float32")
             for j in range(0, 4):
-                with T.block() as []:
+                with T.block():
                     D = T.alloc_buffer((6, 1), "float32")
                     T.reads(A[i, j])
                     T.writes(B[0, j])
@@ -286,12 +286,12 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
                     for k in range(2, 4):
                         B[0, j] = A[i, j] + D[k - 2, 0]
             for j in range(3, 5):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[0, j])
                     T.writes(C[i, j])
                     C[i, j] = B[0, j]
             for j in range(6, 8):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[0, j])
                     T.writes(C[i, j])
                     C[i, j] = B[0, j]
@@ -309,12 +309,12 @@ def match_buffer_func(a: T.handle, c: T.handle) -> None:
             with T.block():
                 B0 = T.match_buffer(B[i, 0:16], (16))
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         A1 = T.match_buffer(A0[j], ())
                         B1 = T.match_buffer(B0[j], ())
                         B1[()] = A1[()] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     C1 = T.match_buffer(C0[j], ())
                     B2 = T.match_buffer(B[i, j], ())
                     C1[()] = B2[()] * 2.0
@@ -332,12 +332,12 @@ def compacted_match_buffer_func(a: T.handle, c: T.handle) -> None:
             with T.block():
                 B0 = T.match_buffer(B[0, 0:16], (16))
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         A1 = T.match_buffer(A0[j], ())
                         B1 = T.match_buffer(B0[j], ())
                         B1[()] = A1[()] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     C1 = T.match_buffer(C0[j], ())
                     B2 = T.match_buffer(B[0, j], ())
                     C1[()] = B2[()] * 2.0
@@ -353,13 +353,13 @@ def storage_align_func(a: T.handle, c: T.handle) -> None:
             T.writes(C[i, 0:16])
             B = T.alloc_buffer((16, 16), "float32")
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[i, j])
                     T.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]})
                     B[i, j] = A[i, j] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[i, j])
                     T.writes(C[i, j])
                     C[i, j] = B[i, j] * 2.0
@@ -375,13 +375,13 @@ def compacted_storage_align_func(a: T.handle, c: T.handle) -> None:
             T.writes(C[i, 0:16])
             B = T.alloc_buffer((1, 16), strides=(31, 1), dtypes="float32")
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[0, j])
                     T.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]})
                     B[0, j] = A[i, j] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[0, j])
                     T.writes(C[i, j])
                     C[i, j] = B[0, j] * 2.0
diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py
index 68b1ad3389645..65be43aba3212 100644
--- a/tests/python/unittest/test_tir_transform_flatten_buffer.py
+++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py
@@ -37,12 +37,12 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None:
             T.writes(C[i, 0:16])
             B = T.alloc_buffer([1, 16], "float32", scope="global")
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[0, j])
                     B[0, j] = A[i, j] + 1.0
             for j in range(0, 16):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[0, j])
                     T.writes(C[i, j])
                     C[i, j] = B[0, j] * 2.0
@@ -74,12 +74,12 @@ def compacted_gpu_func(a: T.handle, c: T.handle) -> None:
                     T.writes(C[i0 * 4 + i1 * 2 + i2, 0:16])
                     B = T.alloc_buffer([1, 16], "float32", scope="local")
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(A[i0 * 4 + i1 * 2 + i2, j])
                             T.writes(B[0, j])
                             B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
                     for j in range(0, 16):
-                        with T.block() as []:
+                        with T.block():
                             T.reads(B[0, j])
                             T.writes(C[i0 * 4 + i1 * 2 + i2, j])
                             C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
@@ -117,12 +117,12 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) ->
             T.writes(C[i, m])
             B = T.alloc_buffer((m,), "float32", scope="global")
             for j in range(0, m):
-                with T.block() as []:
+                with T.block():
                     T.reads(A[i, j])
                     T.writes(B[j])
                     B[j] = A[i, j] + 1.0
             for j in range(0, m):
-                with T.block() as []:
+                with T.block():
                     T.reads(B[j])
                     T.writes(C[i, j])
                     C[i, j] = B[j] * 2.0
@@ -149,7 +149,7 @@ def compacted_predicate_func(a: T.handle, c: T.handle) -> None:
     C = T.match_buffer(c, (32), "float32")
 
     for i, j in T.grid(5, 7):
-        with T.block() as []:
+        with T.block():
             T.reads(A[i * 7 + j])
             T.writes(C[i * 7 + j])
             T.where(i * 7 + j < 32)
@@ -174,7 +174,7 @@ def compacted_unit_loop_func(a: T.handle, c: T.handle) -> None:
     C = T.match_buffer(c, (32), "float32")
 
     for x, y, z in T.grid(4, 1, 8):
-        with T.block() as []:
+        with T.block():
             T.reads(A[x * 8 + y * 8 + z])
             T.writes(C[x * 8 + y * 8 + z])
             C[x * 8 + y * 8 + z] = A[x * 8 + y * 8 + z] + 1.0
@@ -197,7 +197,7 @@ def compacted_multi_alloc_func(a: T.handle, d: T.handle) -> None:
     D = T.match_buffer(d, (32), "float32")
 
     for i in range(0, 32):
-        with T.block() as []:
+        with T.block():
             T.reads(A[i])
             T.writes(D[i])
             B = T.alloc_buffer((32,), scope="global")
@@ -233,13 +233,13 @@ def compacted_strided_buffer_func(a: T.handle, c: T.handle) -> None:
             B = T.alloc_buffer([4, 16], "float32", strides=[17, 1], scope="global")
             for i1 in range(0, 4):
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         T.reads(A[i0 * 4 + i1, j])
                         T.writes(B[i1, j])
                         B[i1, j] = A[i0 * 4 + i1, j] + 1.0
             for i1 in range(0, 4):
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         T.reads(B[i1, j])
                         T.writes(C[i0 * 4 + i1, j])
                         C[i0 * 4 + i1, j] = B[i1, j] * 2.0
diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py
index 17e6d94e67443..c4b4afb24f820 100644
--- a/tests/python/unittest/test_tvmscript_complete.py
+++ b/tests/python/unittest/test_tvmscript_complete.py
@@ -62,7 +62,7 @@ def elementwise_with_root(a: T.handle, b: T.handle, c: T.handle) -> None:
     B = T.match_buffer(b, [128, 128])
     C = T.match_buffer(c, [128, 128])
 
-    with T.block() as []:
+    with T.block():
         for i, j in T.grid(128, 128):
             with T.block():
                 vi, vj = T.axis.remap("SS", [i, j])
@@ -78,8 +78,8 @@ def func_with_opaque_block(a: T.handle, b: T.handle, c: T.handle) -> None:
     B = T.match_buffer(b, [128, 128])
     C = T.match_buffer(c, [128, 128])
 
-    with T.block() as []:
-        with T.block() as []:
+    with T.block():
+        with T.block():
             B[0, 0] = A[0, 0] + T.float32(1)
         for i, j in T.grid(128, 128):
             with T.block():
@@ -93,7 +93,7 @@ def func_with_part_access_region(a: T.handle, b: T.handle, c: T.handle) -> None:
     B = T.match_buffer(b, [128, 128])
     C = T.match_buffer(c, [128, 128])
 
-    with T.block() as []:
+    with T.block():
         for i, j in T.grid(128, 128):
             with T.block():
                 vi, vj = T.axis.remap("SS", [i, j])
@@ -263,7 +263,7 @@ def match_buffer_func(a: T.handle) -> None:
             A0 = T.match_buffer(A[i, 0:16], (16))
             with T.block():
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         A1 = T.match_buffer(A0[j], ())
                         A1[()] = 1.0
 
@@ -280,7 +280,7 @@ def expected_match_buffer_func(a: T.handle) -> None:
                 T.reads([])
                 T.writes(A0[0:16])
                 for j in range(0, 16):
-                    with T.block() as []:
+                    with T.block():
                         T.reads([])
                         T.writes(A0[j])
                         A1 = T.match_buffer(A0[j], ())
diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 948a762168318..f6db826dfda62 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -2840,7 +2840,7 @@ def rank0_block(a: T.handle) -> None:
         B = T.alloc_buffer((), "float32")
         B[()] = A[()]
 
-        with T.block("update") as []:
+        with T.block("update"):
             T.reads([A[()]])
             T.writes([B[()]])
             for i in range(1):