From 88413806fd9f7f27d65f55d4a961954a8aa47690 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo
Date: Thu, 7 Jul 2022 16:45:17 -0700
Subject: [PATCH] [Pylint] Pylint integration_tests folder (#11672)

* add folder to pylint
* add init py
* lint test_arm_mprofile_dsp.py
* one more change to tests/python/integration/test_arm_mprofile_dsp.py
* add test_dot
* test_ewise_fpga.py
* test_ewise.py
* test gemm
* test_lower.py
* test_meta_schedule_auto_tensorize.py
* test_reduce.py pt1
* test_reduce.py pt2
* test_scan.py
* test_tuning.py
* test_winograd_nnpack.py
* final test pass
* comments
* clean up test_lower more
---
 tests/lint/pylint.sh                          |   1 +
 tests/python/integration/__init__.py          |  17 +
 .../integration/test_arm_mprofile_dsp.py      |  10 +-
 tests/python/integration/test_dot.py          |  43 +-
 tests/python/integration/test_ewise.py        | 278 +++++----
 tests/python/integration/test_ewise_fpga.py   |  75 ++-
 tests/python/integration/test_gemm.py         | 115 ++--
 tests/python/integration/test_lower.py        | 360 ++++++-----
 .../test_meta_schedule_auto_tensorize.py      |  61 +-
 tests/python/integration/test_reduce.py       | 585 ++++++++++--------
 tests/python/integration/test_scan.py         |  59 +-
 tests/python/integration/test_tuning.py       | 188 +++---
 .../integration/test_winograd_nnpack.py       |  67 +-
 13 files changed, 1089 insertions(+), 770 deletions(-)
 create mode 100644 tests/python/integration/__init__.py

diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh
index 39568fd3417e..61ffb0fd9254 100755
--- a/tests/lint/pylint.sh
+++ b/tests/lint/pylint.sh
@@ -23,3 +23,4 @@ python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirna
 python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc
 python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc
+python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc
diff --git a/tests/python/integration/__init__.py b/tests/python/integration/__init__.py
new file mode 100644
index 000000000000..56984ac61535
--- /dev/null
+++ b/tests/python/integration/__init__.py
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Infrastructure and tests for e2e integration tests."""
diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py
index 2bcf284f3d77..22b4ebaab832 100644
--- a/tests/python/integration/test_arm_mprofile_dsp.py
+++ b/tests/python/integration/test_arm_mprofile_dsp.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys +"""Test arm mprofile dsp.""" import numpy as np import pytest import tvm @@ -173,16 +173,16 @@ def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype @tvm.testing.requires_corstone300 @pytest.mark.parametrize( - "M, K, N", + "dim_m, dim_k, dim_n", [ (1, 32, 64), (3, 12, 10), ], ) -def test_dense(M, K, N): +def test_dense(dim_m, dim_k, dim_n): """Test a subgraph with a single dense operator.""" - ishape = (M, K) - wshape = (N, K) + ishape = (dim_m, dim_k) + wshape = (dim_n, dim_k) input0 = relay.var("input", relay.TensorType(ishape, "int8")) dense_f = relay.op.nn.batch_flatten(input0) diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index 41abb51a2e99..20e628c8c14b 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -14,31 +14,46 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test scheduling and running a dot product.""" +import numpy as np + import tvm import tvm.testing from tvm import te -import numpy as np @tvm.testing.requires_llvm def test_dot(): - nn = 12 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - k = te.reduce_axis((0, n), "k") - C = te.compute((), lambda: te.sum(A[k] * B[k], axis=k), name="C") - s = te.create_schedule(C.op) + """Test dot product.""" + arr_length = 12 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.placeholder((arr_length_tvm,), name="B") + reduce_axis_k = te.reduce_axis((0, arr_length_tvm), "k") + result_c = te.compute( + (), + lambda: te.sum( + placeholder_a[reduce_axis_k] * placeholder_b[reduce_axis_k], axis=reduce_axis_k + ), + name="C", + ) + schedule = te.create_schedule(result_c.op) def verify(target): - f = tvm.driver.build(s, [A, B, C], target) + f = tvm.driver.build(schedule, [placeholder_a, placeholder_b, result_c], target) # verify dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((), dtype=C.dtype), dev) - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_b.dtype), dev + ) + buff_c = tvm.nd.array(np.zeros((), dtype=result_c.dtype), dev) + f(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose( + buff_c.numpy(), np.dot(buff_a.numpy(), buff_b.numpy()), rtol=1e-4 + ) verify("llvm") diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 3250efc3f71e..8bfa6b17175d 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -14,26 +14,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test elementwise integration.""" +import numpy as np + import tvm +import tvm.testing from tvm import te from tvm.contrib import nvcc -import numpy as np -import time -import tvm.testing @tvm.testing.requires_gpu def test_exp(): + """Test scheduling and running exponent.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.compute(placeholder_a.shape, lambda *i: te.exp(placeholder_a(*i)), name="B") + schedule = te.create_schedule(placeholder_b.op) # create iter var and assign them tags. num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) + axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread) + schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x")) + schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): @@ -43,14 +46,13 @@ def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fexp = tvm.build(s, [A, B], device, host, name="myexp") + fexp = tvm.build(schedule, [placeholder_a, placeholder_b], device, host, name="myexp") dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - fexp(a, b) - tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros(arr_length, dtype=placeholder_b.dtype), dev) + fexp(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5) check_device("opencl -device=intel_graphics") check_device("cuda", "llvm") @@ -59,16 +61,19 @@ def check_device(device, host="stackvm"): @tvm.testing.requires_gpu def test_fmod(): + """Test scheduling and running fmod.""" # graph def run(dtype): - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name="C") - s = te.create_schedule(C.op) + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)), name="C" + ) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 8 - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) def check_device(device): dev = tvm.device(device, 0) @@ -77,26 +82,29 @@ def check_device(device): return target = tvm.target.Target(device) if "cpu" not in target.keys: - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - fmod = tvm.build(s, [A, B, C], device, name="myfmod") + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) + fmod = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myfmod" + ) # launch the kernel. - n = 1024 - a_np = (np.random.uniform(size=n) * 256).astype(A.dtype) - b_np = (np.random.uniform(size=n) * 256).astype(B.dtype) + value_n = 1024 + a_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_a.dtype) + b_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_b.dtype) # "fix" the values in a and b to avoid the result being too small b_np += (b_np < 2.0) * 2 a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1 - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev) ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean - # fmod(a, b, c) - np.testing.assert_allclose(c.numpy(), np.mod(a.numpy(), b.numpy()), rtol=1e-5) + _ = ftimer(buff_a, buff_b, buff_c).mean + np.testing.assert_allclose( + buff_c.numpy(), np.mod(buff_a.numpy(), buff_b.numpy()), rtol=1e-5 + ) check_device("cuda") check_device("opencl -device=intel_graphics") @@ -107,21 +115,30 @@ def check_device(device): @tvm.testing.requires_gpu def test_multiple_cache_write(): + """Test multiple cache writes.""" # graph - n = tvm.runtime.convert(1024) - A0 = te.placeholder((n,), name="A0", dtype="float32") - A1 = te.placeholder((n,), name="A1", dtype="float32") - B0, B1 = te.compute((n,), lambda *i: (A0(*i) + A1(*i), A0(*i) * A1(*i)), name="B") - C = te.compute((n,), lambda *i: B0(*i) + B1(*i), name="C") - s = te.create_schedule(C.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a0 = te.placeholder((arr_length_tvm,), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((arr_length_tvm,), name="A1", dtype="float32") + result_b0, result_b1 = te.compute( + (arr_length_tvm,), + lambda *i: ( + placeholder_a0(*i) + placeholder_a1(*i), + placeholder_a0(*i) * placeholder_a1(*i), + ), + name="B", + ) + result_c = te.compute((arr_length_tvm,), lambda *i: result_b0(*i) + result_b1(*i), name="C") + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 8 - B0_cache, B1_cache = s.cache_write([B0, B1], "local") - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) - s[B0].compute_at(s[C], bx) - s[B0_cache].compute_at(s[C], bx) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) + cache_b0, _ = schedule.cache_write([result_b0, result_b1], "local") + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) + schedule[result_b0].compute_at(schedule[result_c], axis0) + schedule[cache_b0].compute_at(schedule[result_c], axis0) + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): @@ -129,16 +146,23 @@ def check_device(device, host="stackvm"): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): return - func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") + func = tvm.build( + schedule, + [placeholder_a0, placeholder_a1, result_c], + device, + host, + name="multiple_cache_write", + ) dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), dev) - a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - func(a0, a1, c) + buff_a0 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a0.dtype), dev) + buff_a1 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a1.dtype), dev) + buff_c = tvm.nd.array(np.zeros(arr_length, dtype=result_c.dtype), dev) + func(buff_a0, buff_a1, buff_c) tvm.testing.assert_allclose( - c.numpy(), a0.numpy() + a1.numpy() + (a0.numpy() * a1.numpy()), rtol=1e-5 + buff_c.numpy(), + buff_a0.numpy() + buff_a1.numpy() + (buff_a0.numpy() * buff_a1.numpy()), + rtol=1e-5, ) check_device("cuda", "llvm") @@ -147,41 +171,49 @@ def check_device(device, host="stackvm"): def test_log_pow_llvm(): + """Test log pow using llvm to lower.""" # graph - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.power(te.log(A(*i)), 2.0), name="B") - s = te.create_schedule(B.op) + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A") + result_b = te.compute( + placeholder_a.shape, lambda *i: te.power(te.log(placeholder_a(*i)), 2.0), name="B" + ) + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. - bx, tx = s[B].split(B.op.axis[0], factor=32) + schedule[result_b].split(result_b.op.axis[0], factor=32) # one line to build the function. if not tvm.testing.device_enabled("llvm"): return - flog = tvm.build(s, [A, B], "llvm", name="mylog") + flog = tvm.build(schedule, [placeholder_a, result_b], "llvm", name="mylog") dev = tvm.cpu(0) # launch the kernel. 
- n = 1028 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) + size_var_n = 1028 + buff_a = tvm.nd.array(np.random.uniform(size=size_var_n).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros(size_var_n, dtype=result_b.dtype), dev) repeat = 10 ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat) - res = ftimer(a, b) + res = ftimer(buff_a, buff_b) assert len(res.results) == repeat - tvm.testing.assert_allclose(b.numpy(), np.power(np.log(a.numpy()), 2.0), rtol=1e-5) + tvm.testing.assert_allclose(buff_b.numpy(), np.power(np.log(buff_a.numpy()), 2.0), rtol=1e-5) @tvm.testing.uses_gpu def test_popcount(): + """Test popcount.""" + def run(dtype): # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.compute(A.shape, lambda *i: tvm.tir.popcount(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(1024) + placeholder_a = te.placeholder((arr_length_tvm,), name="A", dtype=dtype) + placeholder_b = te.compute( + placeholder_a.shape, lambda *i: tvm.tir.popcount(placeholder_a(*i)), name="B" + ) + schedule = te.create_schedule(placeholder_b.op) # simple schedule num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) + axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread) def check_device(device): dev = tvm.device(device, 0) @@ -190,16 +222,17 @@ def check_device(device): return target = tvm.target.Target(device) if "cpu" not in target.keys: - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], device) + schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x")) + schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x")) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), dev) - func(a, b) + buff_a = tvm.nd.array( + np.random.randint(low=0, high=1000, size=arr_length, dtype=placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(shape=arr_length, dtype=placeholder_b.dtype), dev) + func(buff_a, buff_b) tvm.testing.assert_allclose( - b.numpy(), list(map(lambda x: bin(x).count("1"), a.numpy())), rtol=1e-5 + buff_b.numpy(), list(map(lambda x: bin(x).count("1"), buff_a.numpy())), rtol=1e-5 ) check_device("llvm") @@ -215,24 +248,26 @@ def check_device(device): @tvm.testing.requires_gpu def test_add(): + """Test addition.""" + def run(dtype): # graph - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - bias = te.var("bias", dtype=dtype) - scale = te.var("scale", dtype=dtype) - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C" + ) # schedule - s = te.create_schedule(C.op) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. 
num_thread = 16 - bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=4) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].vectorize(x) + axis_bx, axis_x = schedule[result_c].split(result_c.op.axis[0], factor=num_thread * 4) + axis_tx, axis_x = schedule[result_c].split(axis_x, nparts=num_thread) + _, axis_x = schedule[result_c].split(axis_x, factor=4) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[result_c].vectorize(axis_x) # one line to build the function. def check_device(device): @@ -240,16 +275,22 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fadd = tvm.build(s, [A, B, C], device, name="myadd") + fadd = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myadd" + ) # launch the kernel. n = 1024 - a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev) - b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + buff_a = tvm.nd.array( + (np.random.uniform(size=n) * 256).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array( + (np.random.uniform(size=n) * 256).astype(placeholder_b.dtype), dev + ) + buff_c = tvm.nd.array(np.zeros(n, dtype=result_c.dtype), dev) ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy(), rtol=1e-6) + _ = ftimer(buff_a, buff_b, buff_c).mean + tvm.testing.assert_allclose(buff_c.numpy(), buff_a.numpy() + buff_b.numpy(), rtol=1e-6) check_device("opencl") check_device("cuda") @@ -265,25 +306,26 @@ def check_device(device): @tvm.testing.requires_gpu def try_warp_memory(): - """skip this in default test because it require higher arch""" - m = 128 - A = te.placeholder((m,), name="A") - B = te.compute((m,), lambda i: A[i] + 3, name="B") + """Test using warp memory + skip this in default test because it require higher arch""" + arr_size = 128 + placeholder_a = te.placeholder((arr_size,), name="A") + result_b = te.compute((arr_size,), lambda i: placeholder_a[i] + 3, name="B") warp_size = 32 - s = te.create_schedule(B.op) - AA = s.cache_read(A, "warp", [B]) - xo, xi = s[B].split(B.op.axis[0], warp_size * 2) - xi0, xi1 = s[B].split(xi, factor=warp_size) - tx = te.thread_axis("threadIdx.x") - s[B].bind(xi1, tx) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[AA].compute_at(s[B], xo) - xo, xi = s[AA].split(s[AA].op.axis[0], warp_size) - s[AA].bind(xi, tx) + schedule = te.create_schedule(result_b.op) + cache_read_aa = schedule.cache_read(placeholder_a, "warp", [result_b]) + axis_x0, axis_xi = schedule[result_b].split(result_b.op.axis[0], warp_size * 2) + _, axis_xi1 = schedule[result_b].split(axis_xi, factor=warp_size) + thread_axis_tx = te.thread_axis("threadIdx.x") + schedule[result_b].bind(axis_xi1, thread_axis_tx) + schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x")) + schedule[cache_read_aa].compute_at(schedule[result_b], axis_x0) + axis_x0, axis_xi = schedule[cache_read_aa].split(schedule[cache_read_aa].op.axis[0], warp_size) + schedule[cache_read_aa].bind(axis_xi, thread_axis_tx) @tvm.register_func("tvm_callback_cuda_compile", override=True) - def tvm_callback_cuda_compile(code): - ptx = nvcc.compile_cuda(code, target_format="ptx") + def 
tvm_callback_cuda_compile(code): # pylint: disable=unused-variable + ptx = nvcc.compile_cuda(code) return ptx # one line to build the function. @@ -292,11 +334,13 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - f = tvm.build(s, [A, B], device) - a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - f(a, b) - tvm.testing.assert_allclose(b.numpy(), a.numpy() + 3, rtol=1e-6) + myfunc = tvm.build(schedule, [placeholder_a, result_b], device) + buff_a = tvm.nd.array( + (np.random.uniform(size=arr_size) * 256).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(arr_size, dtype=result_b.dtype), dev) + myfunc(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), buff_a.numpy() + 3, rtol=1e-6) check_device("cuda") diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py index 6171c37b1672..7b247d7d527f 100644 --- a/tests/python/integration/test_ewise_fpga.py +++ b/tests/python/integration/test_ewise_fpga.py @@ -14,11 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test elementwise ops on fpga.""" +import os + +import numpy as np + import tvm import tvm.testing from tvm import te -import numpy as np -import os os.environ["XCL_EMULATION_MODE"] = "1" os.environ["CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA"] = "1" @@ -32,28 +35,29 @@ def tvm_callback_vhls_postproc(code): def test_exp(): + """Test scheduling and running exp function.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_b = te.placeholder((arr_length_tvm,), name="A") + result_b = te.compute(placeholder_b.shape, lambda *i: te.exp(placeholder_b(*i)), name="B") + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. - px, x = s[B].split(B.op.axis[0], nparts=1) - s[B].bind(px, te.thread_axis("pipeline")) + axis1, _ = schedule[result_b].split(result_b.op.axis[0], nparts=1) + schedule[result_b].bind(axis1, te.thread_axis("pipeline")) # one line to build the function. def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) - fexp = tvm.build(s, [A, B], device, host, name="myexp") + fexp = tvm.build(schedule, [placeholder_b, result_b], device, host, name="myexp") dev = tvm.device(device, 0) # launch the kernel. 
- n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - fexp(a, b) - tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev) + buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev) + fexp(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5) check_device("sdaccel") if "AWS_PLATFORM" in os.environ: @@ -63,34 +67,41 @@ def check_device(device, host="llvm"): def test_multi_kernel(): + """Test scheduling with multiple computes.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name="D") - s = te.create_schedule(D.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.placeholder((arr_length_tvm,), name="B") + result_c = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C" + ) + result_d = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + result_c(*i), name="D" + ) + schedule = te.create_schedule(result_d.op) # create iter var and assign them tags. - px, x = s[C].split(C.op.axis[0], nparts=1) - s[C].bind(px, te.thread_axis("pipeline")) - px, x = s[D].split(D.op.axis[0], nparts=1) - s[D].bind(px, te.thread_axis("pipeline")) + axis1, _ = schedule[result_c].split(result_c.op.axis[0], nparts=1) + schedule[result_c].bind(axis1, te.thread_axis("pipeline")) + axis1, _ = schedule[result_d].split(result_d.op.axis[0], nparts=1) + schedule[result_d].bind(axis1, te.thread_axis("pipeline")) # one line to build the function. def check_device(device, host="llvm"): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) - fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd") + fadd = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c, result_d], device, host, name="myadd" + ) dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) - d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), dev) - fadd(a, b, c, d) - tvm.testing.assert_allclose(d.numpy(), a.numpy() * 2 + b.numpy(), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev) + buff_c = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_c.dtype), dev) + buff_d = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_d.dtype), dev) + fadd(buff_a, buff_b, buff_c, buff_d) + tvm.testing.assert_allclose(buff_d.numpy(), buff_a.numpy() * 2 + buff_b.numpy(), rtol=1e-5) check_device("sdaccel") check_device("aocl_sw_emu") diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index aa6c5a1e74e1..66d777989d8c 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -14,27 +14,32 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -import tvm -from tvm import te +"""Test scheduling and running a gemm!""" import numpy as np -import time + +import tvm import tvm.testing +from tvm import te @tvm.testing.requires_gpu def test_gemm(): + """Test the gemm!""" # graph - nn = 1024 - n = tvm.runtime.convert(nn) - m = n - l = n - A = te.placeholder((n, l), name="A") - B = te.placeholder((m, l), name="B") - k = te.reduce_axis((0, l), name="k") - C = te.compute((n, m), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC") + dim1_length = 1024 + dim_n = tvm.runtime.convert(dim1_length) + dim_m = dim_n + dim_l = dim_n + placeholder_a = te.placeholder((dim_n, dim_l), name="A") + placeholder_b = te.placeholder((dim_m, dim_l), name="B") + axis_k = te.reduce_axis((0, dim_l), name="k") + result_c = te.compute( + (dim_n, dim_m), + lambda ii, jj: te.sum(placeholder_a[ii, axis_k] * placeholder_b[jj, axis_k], axis=axis_k), + name="CC", + ) # schedule - s = te.create_schedule(C.op) - xtile, ytile = 32, 32 + schedule = te.create_schedule(result_c.op) scale = 8 num_thread = 8 block_factor = scale * num_thread @@ -43,39 +48,43 @@ def test_gemm(): block_y = te.thread_axis("blockIdx.y") thread_y = te.thread_axis("threadIdx.y") - CC = s.cache_write(C, "local") - AA = s.cache_read(A, "shared", [CC]) - BB = s.cache_read(B, "shared", [CC]) - by, yi = s[C].split(C.op.axis[0], factor=block_factor) - bx, xi = s[C].split(C.op.axis[1], factor=block_factor) - s[C].reorder(by, bx, yi, xi) - s[C].bind(by, block_y) - s[C].bind(bx, block_x) - ty, yi = s[C].split(yi, nparts=num_thread) - tx, xi = s[C].split(xi, nparts=num_thread) - s[C].reorder(ty, tx, yi, xi) - s[C].bind(ty, thread_y) - s[C].bind(tx, thread_x) - yo, xo = CC.op.axis - s[CC].reorder(k, yo, xo) + cache_write = schedule.cache_write(result_c, "local") + cache_read_a = schedule.cache_read(placeholder_a, "shared", [cache_write]) + cache_read_b = schedule.cache_read(placeholder_b, "shared", [cache_write]) + axis_by, axis_yi = schedule[result_c].split(result_c.op.axis[0], factor=block_factor) + axis_bx, axis_xi = schedule[result_c].split(result_c.op.axis[1], factor=block_factor) + schedule[result_c].reorder(axis_by, axis_bx, axis_yi, axis_xi) + schedule[result_c].bind(axis_by, block_y) + schedule[result_c].bind(axis_bx, block_x) + axis_ty, axis_yi = schedule[result_c].split(axis_yi, nparts=num_thread) + axis_tx, axis_xi = schedule[result_c].split(axis_xi, nparts=num_thread) + schedule[result_c].reorder(axis_ty, axis_tx, axis_yi, axis_xi) + schedule[result_c].bind(axis_ty, thread_y) + schedule[result_c].bind(axis_tx, thread_x) + axis_yo, axis_xo = cache_write.op.axis + schedule[cache_write].reorder(axis_k, axis_yo, axis_xo) - s[CC].compute_at(s[C], tx) - s[AA].compute_at(s[CC], k) - s[BB].compute_at(s[CC], k) - s[AA].double_buffer() - s[BB].double_buffer() - ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) - tx, xi = s[AA].split(xi, nparts=num_thread) - s[AA].bind(ty, thread_y) - s[AA].bind(tx, thread_x) + schedule[cache_write].compute_at(schedule[result_c], axis_tx) + schedule[cache_read_a].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_b].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_a].double_buffer() + schedule[cache_read_b].double_buffer() + axis_ty, axis_xi = schedule[cache_read_a].split( + schedule[cache_read_a].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_a].split(axis_xi, nparts=num_thread) + 
schedule[cache_read_a].bind(axis_ty, thread_y) + schedule[cache_read_a].bind(axis_tx, thread_x) - ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) - tx, xi = s[BB].split(xi, nparts=num_thread) - s[BB].bind(ty, thread_y) - s[BB].bind(tx, thread_x) + axis_ty, axis_xi = schedule[cache_read_b].split( + schedule[cache_read_b].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_b].split(axis_xi, nparts=num_thread) + schedule[cache_read_b].bind(axis_ty, thread_y) + schedule[cache_read_b].bind(axis_tx, thread_x) # lowering test - s = s.normalize() + schedule = schedule.normalize() # one line to build the function. def check_device(device): @@ -85,21 +94,21 @@ def check_device(device): return with tvm.target.Target(device): - f = tvm.build(s, [A, B, C]) + f = tvm.build(schedule, [placeholder_a, placeholder_b, result_c]) # launch the kernel. - n = nn - m = n - l = n - a_np = np.random.uniform(size=(n, l)).astype(A.dtype) - b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + num_n = dim1_length + num_m = num_n + num_l = num_n + a_np = np.random.uniform(size=(num_n, num_l)).astype(placeholder_a.dtype) + b_np = np.random.uniform(size=(num_m, num_l)).astype(placeholder_b.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((num_n, num_m), dtype=result_c.dtype), dev) ftimer = f.time_evaluator(f.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean + tcost = ftimer(buff_a, buff_b, buff_c).mean print("%s: exec=%g sec/op" % (dev, tcost)) - tvm.testing.assert_allclose(c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) + tvm.testing.assert_allclose(buff_c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") check_device("nvptx -mcpu=sm_20") diff --git a/tests/python/integration/test_lower.py b/tests/python/integration/test_lower.py index 63733b05ab3f..1ccdde8b1337 100644 --- a/tests/python/integration/test_lower.py +++ b/tests/python/integration/test_lower.py @@ -14,42 +14,52 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument -"""Test workload for lowering and build""" +"""Test workload for lowering and build.""" +import numpy as np + import tvm -from tvm import tir -from tvm.script import tir as T import tvm.testing -import numpy as np +from tvm.script import tir as T @T.prim_func -def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: +def tensorcore_gemm(handle_a: T.handle, handle_b: T.handle, handle_c: T.handle) -> None: + # pylint: disable=missing-function-docstring # match buffer - A = T.match_buffer(a, [1024, 1024], "float16") - B = T.match_buffer(b, [1024, 1024], "float16") - C = T.match_buffer(c, [1024, 1024], "float32") + match_buffer_a = T.match_buffer(handle_a, [1024, 1024], "float16") + match_buffer_b = T.match_buffer(handle_b, [1024, 1024], "float16") + match_buffer_c = T.match_buffer(handle_c, [1024, 1024], "float32") # body - for blockIdx_x in T.thread_binding(0, 16, "blockIdx.x"): - for blockIdx_y in T.thread_binding(0, 8, "blockIdx.y"): + for block_idx_x in T.thread_binding(0, 16, "blockIdx.x"): + for block_idx_y in T.thread_binding(0, 8, "blockIdx.y"): with T.block(): - bx, by = T.axis.remap("SS", [blockIdx_x, blockIdx_y]) - shared_A = T.alloc_buffer([1024, 1024], "float16", scope="shared") - shared_B = T.alloc_buffer([1024, 1024], "float16", scope="shared") - wmma_A = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") - wmma_B = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") - wmma_C = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") - for ty in T.thread_binding(0, 2, "threadIdx.y"): - for tz in T.thread_binding(0, 2, "threadIdx.z"): - for i, j in T.grid(2, 4): + axis_bx, axis_by = T.axis.remap("SS", [block_idx_x, block_idx_y]) + shared_a = T.alloc_buffer([1024, 1024], "float16", scope="shared") + shared_b = T.alloc_buffer([1024, 1024], "float16", scope="shared") + wmma_a = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") + wmma_b = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") + wmma_c = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") + + # pylint: disable=too-many-nested-blocks + for thread_ty in T.thread_binding(0, 2, "threadIdx.y"): + for thread_tz in T.thread_binding(0, 2, "threadIdx.z"): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) T.reads([]) - T.writes(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - C0 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + T.writes( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + match_buffer_c0 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -58,62 +68,92 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_fill_fragment( - C0.data, + match_buffer_c0.data, 16, 16, 16, - i * 4 + j, - T.float32(0), + index_i * 4 + index_jj, + T.float32(0), # pylint: disable=not-callable dtype="handle", ) ) - for ko in range(0, 32): + for k_o in range(0, 32): # copy data from global to shared - for tx in T.thread_binding(0, 32, "threadIdx.x"): - for i0, j0 in T.grid(1, 4): - for j1 in T.vectorized(0, 4): + 
for thread_tx in T.thread_binding(0, 32, "threadIdx.x"): + for index_i0, index_j0 in T.grid(1, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, bx * 64 + ty * 32 + tx + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_A[vi, vj + 8] = A[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_bx * 64 + + thread_ty * 32 + + thread_tx + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_a[new_axis_vi, new_axis_vj + 8] = match_buffer_a[ + new_axis_vi, new_axis_vj + ] - for i0, j0 in T.grid(2, 4): - for j1 in T.vectorized(0, 4): + for index_i0, index_j0 in T.grid(2, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, by * 128 + ty * 64 + tx * 2 + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_B[vi, vj + 8] = B[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_by * 128 + + thread_ty * 64 + + thread_tx * 2 + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_b[new_axis_vi, new_axis_vj + 8] = match_buffer_b[ + new_axis_vi, new_axis_vj + ] - for ki in range(0, 2): - for i in range(0, 2): + for k_i in range(0, 2): + for index_i in range(0, 2): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - A0 = T.match_buffer( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_a0 = T.match_buffer( + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_A0 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a0 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -122,52 +162,60 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_A0.data, + wmma_a0.data, 16, 16, 16, - i, + index_i, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset + 8, - A0.strides[0], + match_buffer_a0.data, + match_buffer_a0.elem_offset + 8, + match_buffer_a0.strides[0], 1, dtype="handle", ), - A0.strides[0], + match_buffer_a0.strides[0], "row_major", dtype="handle", ) ) - for j in range(0, 4): + for index_jj in range(0, 4): with T.block(): - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 
: axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - B0 = T.match_buffer( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_b0 = T.match_buffer( + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_B0 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b0 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -176,63 +224,82 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_B0.data, + wmma_b0.data, 16, 16, 16, - j, + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - B0.data, - B0.elem_offset + 8, - B0.strides[0], + match_buffer_b0.data, + match_buffer_b0.elem_offset + 8, + match_buffer_b0.strides[0], 1, dtype="handle", ), - B0.strides[0], + match_buffer_b0.strides[0], "col_major", dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.R(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.R(64, k_o * 2 + k_i) T.reads( [ - wmma_A[ - vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_B[ - vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_C[ - vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16 + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, ], ] ) T.writes( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16] + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] ) - wmma_A1 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a1 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_a", offset_factor=1, ) - wmma_B1 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b1 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_b", offset_factor=1, ) - wmma_C1 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + wmma_c1 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -241,56 +308,72 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_mma_sync( - wmma_C1.data, - i * 4 + j, - wmma_A1.data, - i, - wmma_B1.data, - j, - wmma_C1.data, - i * 4 + j, + wmma_c1.data, + index_i * 4 + index_jj, + wmma_a1.data, + index_i, + wmma_b1.data, + index_jj, + wmma_c1.data, + index_i * 4 + index_jj, dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx 
* 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - T.reads(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - s0 = T.var("int32") - s1 = T.var("int32") - wmma_C2 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) + T.reads( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + T.writes( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + stride0 = T.var("int32") + stride1 = T.var("int32") + wmma_c2 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], scope="wmma.accumulator", offset_factor=1, ) - C1 = T.match_buffer( - C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + match_buffer_c1 = T.match_buffer( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", - strides=[s0, s1], + strides=[stride0, stride1], offset_factor=1, ) T.evaluate( T.tvm_store_matrix_sync( - wmma_C2.data, + wmma_c2.data, 16, 16, 16, - i * 4 + j, + index_i * 4 + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float32"), - C1.data, - C1.elem_offset, - C1.strides[0], + match_buffer_c1.data, + match_buffer_c1.elem_offset, + match_buffer_c1.strides[0], 1, dtype="handle", ), - C1.strides[0], + match_buffer_c1.strides[0], "row_major", dtype="handle", ) @@ -299,22 +382,23 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: @tvm.testing.requires_cuda def test_gemm_tensorcore(): + """Test running gemm on tensorcore.""" dev = tvm.device("cuda", 0) a_np = np.random.uniform(size=(1024, 1024)).astype("float16") b_np = np.random.uniform(size=(1024, 1024)).astype("float16") c_np = np.dot(a_np.astype("float32"), b_np.T.astype("float32")) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) - f = tvm.build(tensorcore_gemm, target="cuda", name="dense") - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) + myfunc = tvm.build(tensorcore_gemm, target="cuda", name="dense") + myfunc(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-3) - evaluator = f.time_evaluator(f.entry_name, dev, number=100) - t = evaluator(a, b, c).mean + evaluator = myfunc.time_evaluator(myfunc.entry_name, dev, number=100) + time_elapsed = evaluator(buff_a, buff_b, buff_c).mean num_flops = 2 * 1024 * 1024 * 1024 - gflops = num_flops / (t * 1e3) / 1e6 - print("gemm with tensor core: %f ms" % (t * 1e3)) + gflops = num_flops / (time_elapsed * 1e3) / 1e6 + print("gemm with tensor core: %f ms" % (time_elapsed * 1e3)) print("GFLOPS: %f" % gflops) diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py index 511e75723b03..b855dc6fa09e 100644 --- a/tests/python/integration/test_meta_schedule_auto_tensorize.py +++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py @@ -14,34 +14,32 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +"""Integration test for metascheduler's auto tensorization.""" +import tempfile + +import numpy as np import pytest + import tvm -from tvm import relay import tvm.testing -import numpy as np -from tvm.meta_schedule.tune import tune_extracted_tasks +import tvm.topi.testing +from tvm import meta_schedule as ms +from tvm import relay +from tvm.meta_schedule import ApplyHistoryBest, postproc, schedule_rule from tvm.meta_schedule.relay_integration import extract_task_from_relay -from tvm.meta_schedule import ApplyHistoryBest -from tvm.meta_schedule import schedule_rule, postproc from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base -from tvm import meta_schedule as ms -from tvm.tir.tensor_intrin import ( - VNNI_DOT_16x4_INTRIN as VNNI_INTRIN, - DP4A_INTRIN, - AMDGPU_SDOT4_INTRIN, -) -import tempfile -import tvm.topi.testing - +from tvm.meta_schedule.tune import tune_extracted_tasks +from tvm.tir.tensor_intrin import AMDGPU_SDOT4_INTRIN, DP4A_INTRIN +from tvm.tir.tensor_intrin import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN -config = ms.TuneConfig( +CONFIG = ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=32, max_trials_per_task=32, max_trials_global=20000, ) -sch_rules_for_vnni = [ +SCH_RULES_FOR_VNNI = [ schedule_rule.AutoInline( into_producer=False, into_consumer=True, @@ -113,17 +111,17 @@ def get_sch_rules_for_dp4a(intrin): ] -sch_rules_for_dp4a = get_sch_rules_for_dp4a(DP4A_INTRIN) -sch_rules_for_sdot4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) +SCH_RULES_FOR_DP4A = get_sch_rules_for_dp4a(DP4A_INTRIN) +SCH_RULES_FOR_SDOT4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) -postprocs_for_vnni = [ +POSTPROCS_FOR_VNNI = [ postproc.DisallowDynamicLoop(), postproc.RewriteParallelVectorizeUnroll(), postproc.RewriteReductionBlock(), postproc.RewriteTensorize(vectorize_init_loop=True), ] -postprocs_for_dp4a = [ +POSTPROCS_FOR_DP4A = [ postproc.DisallowDynamicLoop(), postproc.RewriteCooperativeFetch(), postproc.RewriteUnboundBlock(), @@ -135,6 +133,7 @@ def get_sch_rules_for_dp4a(intrin): def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs): + """Test tuning.""" tgt = "cuda" if "nvidia" in target else target dev = tvm.device(tgt, 0) @@ -158,7 +157,7 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -186,9 +185,9 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos def _test_dense(data_dtype, sch_rules, postprocs, target): - M, N, K = 1024, 1024, 1024 - data_shape = (M, K) - weight_shape = (N, K) + dim_m, dim_n, dim_k = 1024, 1024, 1024 + data_shape = (dim_m, dim_k) + weight_shape = (dim_n, dim_k) weight_dtype = "int8" out_dtype = "int32" @@ -255,7 +254,7 @@ def _test_bert_int8(target, sch_rules, postprocs): with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -284,14 +283,14 @@ def _test_bert_int8(target, sch_rules, postprocs): @pytest.mark.skip("Requires cascadelake") def test_vnni_dense(): _test_dense( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm 
-mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_dense(): - _test_dense("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_dense( @@ -305,14 +304,14 @@ def test_dp4a_dense(): @pytest.mark.skip("Requires cascadelake") def test_vnni_conv2d(): _test_conv2d( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_conv2d(): - _test_conv2d("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_conv2d( @@ -325,13 +324,13 @@ def test_dp4a_conv2d(): @pytest.mark.skip("Requires cascadelake") def test_vnni_bert_int8(): - _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", sch_rules_for_vnni, postprocs_for_vnni) + _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI) @tvm.testing.requires_gpu @pytest.mark.skip("Slow on CI") def test_dp4a_bert_int8(): - _test_bert_int8("nvidia/geforce-rtx-3070", sch_rules_for_dp4a, postprocs_for_dp4a) + _test_bert_int8("nvidia/geforce-rtx-3070", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A) # Uncomment to test on vulkan or rocm target # _test_bert_int8("vulkan -from_device=0", sch_rules_for_dp4a, postprocs_for_dp4a) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index f3886374ccb6..eaac8ed26684 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test scheduling of reduction operations.""" import pytest import numpy as np @@ -26,22 +27,28 @@ @tvm.testing.requires_gpu def test_reduce_prims(): + """Test reduction operations.""" + def test_prim(reducer, np_reducer): # graph - n = tvm.te.size_var("n") - m = tvm.te.size_var("m") - A = te.placeholder((n, m), name="A") - R = te.compute((n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") - k = te.reduce_axis((0, m)) - B = te.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i] == 1)), name="B") + size_var_n = tvm.te.size_var("n") + size_var_m = tvm.te.size_var("m") + placeholder_a = te.placeholder((size_var_n, size_var_m), name="A") + result_r = te.compute((size_var_n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") + axis_k = te.reduce_axis((0, size_var_m)) + result_b = te.compute( + (size_var_n,), + lambda i: reducer(placeholder_a[i, axis_k], axis=axis_k, where=(result_r[i] == 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. 
num_thread = 1 - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - s[R].compute_inline() + axis_x0, axis_x1 = schedule[result_b].split(result_b.op.axis[0], factor=num_thread) + schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x")) + schedule[result_b].bind(axis_x1, te.thread_axis("threadIdx.x")) + schedule[result_r].compute_inline() # one line to build the function. def check_device(device, host="llvm"): @@ -50,17 +57,22 @@ def check_device(device, host="llvm"): print("skip because %s is not enabled.." % device) return freduce = tvm.build( - s, args=[A, B], target=tvm.target.Target(device, host), name="myreduce" + schedule, + args=[placeholder_a, result_b], + target=tvm.target.Target(device, host), + name="myreduce", ) # launch the kernel. - n = 1028 - m = 129 - x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) - y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - freduce(x, y) - npy = y.numpy() + num_n = 1028 + num_m = 129 + buff_x = tvm.nd.array( + np.random.uniform(size=(num_n, num_m)).astype(placeholder_a.dtype), dev + ) + buff_y = tvm.nd.array(np.zeros(num_n, dtype=result_b.dtype), dev) + freduce(buff_x, buff_y) + npy = buff_y.numpy() npy[:2] = 0 - res = np_reducer(x.numpy(), axis=1) + res = np_reducer(buff_x.numpy(), axis=1) res[:2] = 0 tvm.testing.assert_allclose(npy, res, rtol=1e-4) @@ -76,192 +88,228 @@ def check_device(device, host="llvm"): def test_init_imm(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k, init=10.0), name="B") + """Test initial values which are immutable in reduction ops.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (), lambda: te.sum(placeholder_a[axis_k], axis=axis_k, init=10.0), name="B" + ) # schedule - s = te.create_schedule(B.op) + schedule_s = te.create_schedule(result_b.op) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule_s, args=[placeholder_a, result_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = 10.0 + np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = 10.0 + np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test initializer which is non-const.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buf_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buf_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buf_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test rfactors with constant inits.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 1) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(result_b.op) + axis_kf, _ = schedule[result_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(result_b, axis_kf, 1) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buff_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buff_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_factor_axis(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors across axis.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 0) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf, 0) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() @tvm.testing.requires_gpu def test_rfactor_threads(): - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactors across threads.""" + num_n = 1027 + num_m = 10 + length_n = tvm.runtime.convert(num_n) + length_m = tvm.runtime.convert(num_m) + placeholder_a = te.placeholder((length_m, length_n), name="A") + axis_k = te.reduce_axis((0, length_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k, where=(i > 1)), name="B") + result_b = te.compute( + (length_m,), + lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k, where=(i > 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - bx, ty = s[B].split(s[B].op.axis[0], factor=nthread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_b.op) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + axis_bx, axis_ty = schedule[result_b].split(schedule[result_b].op.axis[0], factor=nthread) + schedule[result_b].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) - s[B].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_b]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- n = nn - m = mm - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) res[:2] = 0 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -272,46 +320,51 @@ def check_target(device, host="stackvm"): @tvm.testing.requires_gpu def test_rfactor_elemwise_threads(): - n = 1025 - m = 10 - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactor elemwise threads.""" + num_n = 1025 + num_m = 10 + placeholder_a = te.placeholder((num_m, num_n), name="A") + axis_k = te.reduce_axis((0, num_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k), name="B") - BB = te.compute((m,), lambda i: B[i] + 1, name="BB") - C = te.compute((m,), lambda i: BB[i] + 1, name="C") + result_b = te.compute( + (num_m,), lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k), name="B" + ) + result_bb = te.compute((num_m,), lambda i: result_b[i] + 1, name="BB") + result_c = te.compute((num_m,), lambda i: result_bb[i] + 1, name="C") # schedule - s = te.create_schedule(C.op) - s[BB].compute_inline() - bx, ty = s[C].split(s[C].op.axis[0], factor=nthread) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - s[B].compute_at(s[C], ty) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_c.op) + schedule[result_bb].compute_inline() + axis_bx, axis_ty = schedule[result_c].split(schedule[result_c].op.axis[0], factor=nthread) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + schedule[result_b].compute_at(schedule[result_c], axis_ty) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) # Since thread_x is shared across reductions # only one of them need to do write back - s[B].set_store_predicate(thread_x.var.equal(0)) - s[C].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) + schedule[result_c].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, C]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_c]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + 2 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) + 2 + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -321,22 +374,26 @@ def check_target(device, host="stackvm"): def test_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test argmax.""" + + def fcombine(tensor_x, tensor_y): + lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0]) + rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - m = te.size_var("m") - n = te.size_var("n") - idx = te.placeholder((m, n), name="idx", dtype="int32") - val = te.placeholder((m, n), name="val", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T") - s = te.create_schedule(T0.op) + size_var_m = te.size_var("m") + size_var_n = te.size_var("n") + idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32") + val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32") + axis_k = te.reduce_axis((0, size_var_n), "k") + result_t0, result_t1 = te.compute( + (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T" + ) + schedule = te.create_schedule(result_t0.op) def check_target(): device = "cpu" @@ -344,19 +401,19 @@ def check_target(): print("skip because %s is not enabled.." 
% device) return dev = tvm.device(device, 0) - fapi = tvm.lower(s, args=[idx, val, T0, T1]) + fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1]) fargmax = tvm.build(fapi, target="llvm", name="argmax") - mm = 12 - nn = 16 - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + height = 12 + width = 16 + np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0) + np_val = np.random.uniform(size=(height, width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -365,55 +422,63 @@ def check_target(): @tvm.testing.requires_gpu def test_rfactor_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test rfactor argmax""" + + def fcombine(tensor0, tensor1): + lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0]) + rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor0, tensor1): + return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A0 = te.placeholder((m, n), name="A0", dtype="int32") - A1 = te.placeholder((m, n), name="A1", dtype="float32") - k = te.reduce_axis((0, n)) - B0, B1 = te.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name="B") + num_width = 1027 + num_height = 10 + width = tvm.runtime.convert(num_width) + height = tvm.runtime.convert(num_height) + placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32") + placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32") + axis_k = te.reduce_axis((0, width)) + result_b0, result_b1 = te.compute( + (height,), + lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k), + name="B", + ) # schedule - s = te.create_schedule(B0.op) + schedule = te.create_schedule(result_b0.op) nthread = 16 - ko, kf = s[B0].split(k, factor=nthread) - BF0, BF1 = s.rfactor(B0, kf) - bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread) - s[B0].bind(bx, te.thread_axis("blockIdx.x")) - s[B0].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B0].op.reduce_axis[0] + _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread) + rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf) + axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread) + schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b0].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B0].bind(tx, thread_x) - s[BF0.op].compute_at(s[B0], tx) - s[B0].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b0].bind(axis_tx, thread_x) + schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx) + 
schedule[result_b0].set_store_predicate(thread_x.var.equal(0)) def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A0, A1, B0, B1]) + fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1]) fargmax = tvm.build(fapi, target=device, name="argmax") - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + np_idx = np.repeat( + np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0 + ) + np_val = np.random.uniform(size=(num_height, num_width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -424,6 +489,7 @@ def check_target(device): @tvm.testing.requires_gpu def test_warp_reduction1(): + """Test warp reductions.""" nthx = 32 nthy = 4 block_x = te.thread_axis("blockIdx.x") @@ -437,30 +503,34 @@ def check_target(device, m, n): return # compute - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((m,), lambda i: te.max(A[i][k], axis=k), name="B") - s = te.create_schedule(B.op) + placeholder_a = te.placeholder((m, n), name="A") + axis_k = te.reduce_axis((0, n)) + placeholder_b = te.compute( + (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B" + ) + schedule = te.create_schedule(placeholder_b.op) # schedule - k = s[B].op.reduce_axis[0] - ko, _ = s[B].split(k, nparts=nthx) - s[B].bind(ko, thread_x) - xo, xi = s[B].split(s[B].op.axis[0], factor=nthy) - s[B].bind(xi, thread_y) - s[B].bind(xo, block_x) + axis_k = schedule[placeholder_b].op.reduce_axis[0] + axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx) + schedule[placeholder_b].bind(axis_ko, thread_x) + axis_xo, axis_xi = schedule[placeholder_b].split( + schedule[placeholder_b].op.axis[0], factor=nthy + ) + schedule[placeholder_b].bind(axis_xi, thread_y) + schedule[placeholder_b].bind(axis_xo, block_x) - tvm.lower(s, [A, B], simple_mode=True) + tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True) # validation - func = tvm.build(s, [A, B], device, name="warp_reduction") - a_np = np.random.uniform(size=(m, n)).astype(A.dtype) - b_np = np.zeros((m,), dtype=A.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction") + a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype) + b_np = np.zeros((m,), dtype=placeholder_a.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) b_np = np.max(a_np, axis=1) - func(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3) + func(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3) check_target("cuda", m=32, n=256) check_target("cuda", m=10, n=20) @@ -472,21 +542,29 @@ def check_target(device, m, n): @tvm.testing.requires_gpu def test_warp_reduction2(): - def fcombine(x, y): - return x[0] + y[0], x[1] * y[1] + """Test warp reductions.""" + + def fcombine(tensor1, 
tensor2): + return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1] - def fidentity(t0, t1): - return tvm.tir.const(0, t0), tvm.tir.const(1, t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2) add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer") # compute - m = 16 - n = 256 - A0 = te.placeholder((m, n), name="A0", dtype="float32") - A1 = te.placeholder((m, n), name="Al", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: add_mul_reducer((A0[i, k], A1[i, k]), axis=k), name="T") + num_m = 16 + num_n = 256 + placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((num_m, num_n), name="Al", dtype="float32") + axis_k = te.reduce_axis((0, num_n), "k") + result0, result1 = te.compute( + (num_m,), + lambda i: add_mul_reducer( + (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k + ), + name="T", + ) nthdx, nthdy = 32, 2 block_x = te.thread_axis("blockIdx.x") @@ -500,29 +578,31 @@ def check_target(device): return # schedule - s = te.create_schedule(T0.op) - ko, _ = s[T0].split(k, nparts=nthdx) - xo, xi = s[T0].split(s[T0].op.axis[0], factor=nthdy) - s[T0].bind(ko, thread_x) - s[T0].bind(xi, thread_y) - s[T0].bind(xo, block_x) + schedule = te.create_schedule(result0.op) + axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx) + axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy) + schedule[result0].bind(axis_ko, thread_x) + schedule[result0].bind(axis_xi, thread_y) + schedule[result0].bind(axis_xo, block_x) # validation dev = tvm.device(device, 0) - a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype) - a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype) - t0_np = np.zeros((m,), dtype=A0.dtype) - t1_np = np.zeros((m,), dtype=A1.dtype) - a0 = tvm.nd.array(a0_np, dev) - a1 = tvm.nd.array(a1_np, dev) - t0 = tvm.nd.array(t0_np, dev) - t1 = tvm.nd.array(t1_np, dev) - func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction") - func(a0, a1, t0, t1) + a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype) + a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype) + t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype) + t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype) + buff_a0 = tvm.nd.array(a0_np, dev) + buff_a1 = tvm.nd.array(a1_np, dev) + buff_t0 = tvm.nd.array(t0_np, dev) + buff_t1 = tvm.nd.array(t1_np, dev) + func = tvm.build( + schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction" + ) + func(buff_a0, buff_a1, buff_t0, buff_t1) t0_np = np.sum(a0_np, axis=1) t1_np = np.product(a1_np, axis=1) - tvm.testing.assert_allclose(t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) - tvm.testing.assert_allclose(t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) check_target("cuda") check_target("rocm") @@ -530,6 +610,7 @@ def check_target(device): @tvm.testing.requires_cuda def test_reduce_storage_reuse(): + """Test reduction reuses storage.""" target = tvm.target.Target("cuda") def run_passes(sch, args): @@ -547,13 +628,13 @@ def run_passes(sch, args): dev = tvm.device(target.kind.name, 0) shape = (16, 16) - A = te.placeholder(shape, dtype="float32", name="A") - B = topi.nn.softmax(A, axis=1) + 1.0 + placeholder_a = te.placeholder(shape, dtype="float32", name="A") + 
placeholder_b = topi.nn.softmax(placeholder_a, axis=1) + 1.0
 
     with tvm.target.Target(target):
-        s = topi.cuda.schedule_softmax(B)
+        schedule = topi.cuda.schedule_softmax(placeholder_b)
 
-    mod = run_passes(s, [A, B])
+    mod = run_passes(schedule, [placeholder_a, placeholder_b])
 
     # Due to the storage rewrite pass, the reduction output storage reduce_temp0 can be reused as
     # the storage of the next compute.
@@ -586,12 +667,12 @@ def check_store_dst_remapped(op):
     inp = np.random.uniform(size=shape).astype("float32")
     ref = tvm.topi.testing.softmax_python(inp) + 1.0
 
-    f = tvm.build(s, [A, B], target)
-    a = tvm.nd.array(inp, dev)
-    b = tvm.nd.array(np.zeros(shape, dtype=B.dtype), dev)
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), ref, rtol=1e-5)
+    func = tvm.build(schedule, [placeholder_a, placeholder_b], target)
+    buff_a = tvm.nd.array(inp, dev)
+    buff_b = tvm.nd.array(np.zeros(shape, dtype=placeholder_b.dtype), dev)
+    func(buff_a, buff_b)
+    tvm.testing.assert_allclose(buff_b.numpy(), ref, rtol=1e-5)
 
 
 if __name__ == "__main__":
-    pytest.main([__pfile__])
+    pytest.main([__file__])
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index edeb862cd5fc..fa920e513502 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -14,38 +14,43 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import tvm
-from tvm import te
+"""Test scheduling and running scan operators."""
 import numpy as np
+
+import tvm
 import tvm.testing
+from tvm import te
 
 
 @tvm.testing.requires_gpu
 def test_scan():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    X = te.placeholder((m, n), name="X")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: X[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
+    """Test scan operators."""
+    size_var_m = te.size_var("m")
+    size_var_n = te.size_var("n")
+    placeholder_x = te.placeholder((size_var_m, size_var_n), name="X")
+    s_state = te.placeholder((size_var_m, size_var_n))
+    s_init = te.compute((1, size_var_n), lambda _, i: placeholder_x[0, i])
+    s_update = te.compute(
+        (size_var_m, size_var_n), lambda t, i: s_state[t - 1, i] + placeholder_x[t, i]
+    )
     scan = tvm.te.scan(s_init, s_update, s_state)
     # test scan + compute case
-    res = te.compute((m, n), lambda i, j: scan[i, j])
+    res = te.compute((size_var_m, size_var_n), lambda i, j: scan[i, j])
 
     # schedule
-    s = te.create_schedule(res.op)
+    schedule = te.create_schedule(res.op)
     num_thread = 256
     block_x = te.thread_axis(None, "blockIdx.x")
     thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
-    xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
-    s[s_init].bind(xo, block_x)
-    s[s_init].bind(xi, thread_x)
-    xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
-    s[s_update].bind(xo, block_x)
-    s[s_update].bind(xi, thread_x)
-    xo, xi = s[res].split(res.op.axis[1], factor=num_thread)
-    s[res].bind(xo, block_x)
-    s[res].bind(xi, thread_x)
+    axis_xo, axis_xi = schedule[s_init].split(s_init.op.axis[1], factor=num_thread)
+    schedule[s_init].bind(axis_xo, block_x)
+    schedule[s_init].bind(axis_xi, thread_x)
+    axis_xo, axis_xi = schedule[s_update].split(s_update.op.axis[1], factor=num_thread)
+    schedule[s_update].bind(axis_xo, block_x)
+    schedule[s_update].bind(axis_xi, thread_x)
+    axis_xo, axis_xi = schedule[res].split(res.op.axis[1], factor=num_thread)
+    schedule[res].bind(axis_xo, block_x)
+    
schedule[res].bind(axis_xi, thread_x) # one line to build the function. def check_device(device): @@ -53,15 +58,15 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fscan = tvm.build(s, [X, res], device, name="myscan") + fscan = tvm.build(schedule, [placeholder_x, res], device, name="myscan") # launch the kernel. - n = 1024 - m = 10 - a_np = np.random.uniform(size=(m, n)).astype(res.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), dev) - fscan(a, b) - tvm.testing.assert_allclose(b.numpy(), np.cumsum(a_np, axis=0)) + num_n = 1024 + num_m = 10 + a_np = np.random.uniform(size=(num_m, num_n)).astype(res.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(np.zeros((num_m, num_n), dtype=res.dtype), dev) + fscan(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.cumsum(a_np, axis=0)) check_device("vulkan") check_device("cuda") diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 963609ea5901..04c5f85ce5d4 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -19,11 +19,8 @@ """ import logging import multiprocessing as mp -import sys import textwrap -import time -import pytest import tvm import tvm.relay import tvm.testing @@ -34,100 +31,138 @@ from tvm.ir.instrument import pass_instrument from tvm.ir.transform import PassContext from tvm.target import Target +from tvm.tir.analysis import _ffi_api as _analysis_ffi_api def setup_module(): + """Setup the module used for testing.""" + @autotvm.template("testing/conv2d_no_batching") - def conv2d_no_batching(N, H, W, CI, CO, KH, KW): + def conv2d_no_batching( # pylint: disable=unused-variable + batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w + ): """An example template for testing""" - assert N == 1, "Only consider batch_size = 1 in this template" + assert batch_size == 1, "Only consider batch_size = 1 in this template" - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data") + kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel") - rc = te.reduce_axis((0, CI), name="rc") - ry = te.reduce_axis((0, KH), name="ry") - rx = te.reduce_axis((0, KW), name="rx") + axis_rc = te.reduce_axis((0, channels_in), name="rc") + axis_ry = te.reduce_axis((0, kernel_h), name="ry") + axis_rx = te.reduce_axis((0, kernel_w), name="rx") conv = te.compute( - (N, CO, H - KH + 1, W - KW + 1), + (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1), lambda nn, ff, yy, xx: te.sum( - data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx] + data[nn, axis_rc, yy + axis_ry, xx + axis_rx] + * kernel[ff, axis_rc, axis_ry, axis_rx], + axis=[axis_rc, axis_ry, axis_rx], ), tag="conv2d_nchw", ) - s = te.create_schedule([conv.op]) + schedule = te.create_schedule([conv.op]) output = conv - OL = s.cache_write(conv, "local") + cache_write_ol = schedule.cache_write(conv, "local") # create cache stage - AA = s.cache_read(data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - AL = s.cache_read(AA, "local", [OL]) - WL = s.cache_read(WW, "local", [OL]) + cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol]) + cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol]) + 
cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol]) + cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol]) # tile and bind spatial axes - n, f, y, x = s[output].op.axis + axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis cfg = autotvm.get_config() - cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) - cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) - cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - kernel_scope = n # this is the scope to attach global config inside this kernel - - s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) + cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4) + axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f) + axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y) + axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x) + kernel_scope = axis_n # this is the scope to attach global config inside this kernel + + schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z")) + schedule[output].bind(axis_by, te.thread_axis("blockIdx.y")) + schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[output].bind(axis_vf, te.thread_axis("vthread")) + schedule[output].bind(axis_vy, te.thread_axis("vthread")) + schedule[output].bind(axis_vx, te.thread_axis("vthread")) + schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z")) + schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[output].reorder( + axis_n, + axis_bf, + axis_by, + axis_bx, + axis_vf, + axis_vy, + axis_vx, + axis_tf, + axis_ty, + axis_tx, + axis_fi, + axis_yi, + axis_xi, + ) + schedule[cache_write_ol].compute_at(schedule[output], axis_tx) # tile and bind reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) - rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) - rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - s[AL].compute_at(s[OL], rxm) - s[WL].compute_at(s[OL], rxm) + axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis + axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3) + cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3) + cfg.define_split("tile_rx", cfg.axis(axis_rx), 
num_outputs=3) + axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc) + axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry) + axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx) + schedule[cache_write_ol].reorder( + axis_rco, + axis_ryo, + axis_rxo, + axis_rcm, + axis_rym, + axis_rxm, + axis_rci, + axis_ryi, + axis_rxi, + axis_n, + axis_f, + axis_y, + axis_x, + ) + + schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm) + schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm) # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) + for load in [cache_read_aa, cache_read_ww]: + axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis + fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x) + axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2]) + axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2]) + axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2]) + schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z")) + schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + schedule[output].pragma( + kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val + ) + schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - return s, [data, kernel, conv] + return schedule, [data, kernel, conv] def teardown_module(): + """Remove the module from the autotvm task tables.""" # TODO(areusch): Tasks should not be registered into a global. del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"] @@ -158,8 +193,10 @@ def run_test_with_all_multiprocessing(func, *args, **kwargs): @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu(target, dev): - def runner(target, dev): +def test_tuning_gpu(target): + """Test gpu tuning.""" + + def runner(target): # init task task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -181,22 +218,21 @@ def runner(target, dev): r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR - # Autotvm can filter some records before building if we know they won't work ahead of time. - # We can't guarantee we sample at least one good record so we count these as success too + # We filter records before building if we know they won't work ahead of time. 
+ # We can't guarantee we get one good record so we count these as success too or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR ] assert len(successful_results) > 0, f"No successful tuning runs: {results!r}" - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu_inherits_pass_context(target, dev): +def test_tuning_gpu_inherits_pass_context(target): """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default. Test that using PassContext inherits passes properly but also runs gpu verification pass. """ - from tvm.tir.analysis import _ffi_api as _analysis_ffi_api @pass_instrument class PassInstrumentChecker: @@ -205,7 +241,7 @@ class PassInstrumentChecker: def __init__(self): self.has_been_run = False - def run_after_pass(self, mod, info): + def run_after_pass(self, *_): self.has_been_run = True class GPUVerifyPassMocked: @@ -274,10 +310,12 @@ def __init__( do_fork=False, runtime=None, ): + # pylint: disable=too-many-function-args super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime) + self.build_func = OverwrittenBuildFunc(tar.tar, runtime) - def runner(target, dev): + def runner(target): task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -295,10 +333,12 @@ def runner(target, dev): assert len(results) == 1 - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) def test_tuning_cpu(): + """Test tuning on cpu.""" + def runner(): ir_mod = tvm.parser.fromtext( textwrap.dedent( diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 71091f69d964..b088b350c9f0 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -14,18 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test winograd convolution using nnpack impl.""" import numpy as np +from pytest import skip + import tvm -from tvm import te -from tvm import autotvm +import tvm.testing +import tvm.topi.testing +from tvm import autotvm, te, topi from tvm.autotvm.task.space import FallbackConfigEntity from tvm.contrib import nnpack from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from pytest import skip -import tvm.testing def verify_conv2d_nchw( @@ -36,11 +36,12 @@ def verify_conv2d_nchw( kernel, stride, padding, + devices, dilation=1, add_bias=False, add_relu=False, - devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"], ): + """Verify conv2d nchw workload.""" print( "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation) @@ -48,14 +49,14 @@ def verify_conv2d_nchw( in_height = in_width = in_size - A = te.placeholder((batch, in_channel, in_height, in_width), name="A") - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") + placholder_a = te.placeholder((batch, in_channel, in_height, in_width), name="A") + placeholder_w = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) + a_shape = get_const_tuple(placholder_a.shape) + w_shape = get_const_tuple(placeholder_w.shape) bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype + dtype = placholder_a.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): @@ -79,42 +80,52 @@ def check_device(device): print("Skipping %s becuase it is not enabled" % device) print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NCHW", out_dtype=dtype) + result_c = topi.nn.conv2d( + placholder_a, + placeholder_w, + stride, + padding, + dilation, + layout="NCHW", + out_dtype=dtype, + ) if add_bias: - C = topi.add(C, bias) + result_c = topi.add(result_c, bias) if add_relu: - C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + result_c = topi.nn.relu(result_c) + schedule = topi.generic.schedule_conv2d_nchw([result_c]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_w = tvm.nd.array(w_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(get_const_tuple(result_c.shape), dtype=result_c.dtype), dev) if add_bias: func = tvm.build( - s, - [A, W, bias, C], + schedule, + [placholder_a, placeholder_w, bias, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, b, c) + func(buff_a, buff_w, buff_b, buff_c) else: func = tvm.build( - s, - [A, W, C], + schedule, + [placholder_a, placeholder_w, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4) + func(buff_a, buff_w, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-4) for device in devices: check_device(device) class WinogradFallback(autotvm.FallbackContext): + """Winograd fallbacks.""" + def _query_inside(self, target, workload): key = (target, 
workload) if key in self.memory: @@ -126,6 +137,8 @@ def _query_inside(self, target, workload): def test_conv2d_nchw(): + """Verify conv2d nchw winograd works.""" + if not tvm.get_global_func( "tvm.contrib.nnpack.convolution_inference_without_weight_transform", True ):