From 1df07f7b0241a08d6d463f592078378bb8c7cff3 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 1 Mar 2023 15:11:54 -0800 Subject: [PATCH 01/40] initial basis --- include/tvm/topi/transform.h | 10 +- python/test_2.py | 71 ++++++ python/tvm/relay/op/_transform.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 261 +++++++++++++++++++++++ python/tvm/relay/op/strategy/generic.py | 36 +++- python/tvm/topi/cuda/batch_matmul.py | 11 +- python/tvm/topi/transform.py | 17 +- python/tvm/topi/x86/new.py | 12 ++ src/tir/schedule/primitive/compute_at.cc | 5 + src/topi/transform.cc | 2 +- 10 files changed, 406 insertions(+), 21 deletions(-) create mode 100644 python/test_2.py create mode 100644 python/tvm/topi/x86/new.py diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index dff6374a6185..51058485838f 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1596,8 +1596,10 @@ inline Array meshgrid(const Array& inputs, const std::string& in */ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, const std::string& dst_layout, + const std::string schedule_rule = "None", const std::string name = "T_layout_trans", const std::string tag = kInjective) { + // LOG(FATAL) << "WHATTT"; Layout src_layout_struct(src_layout); Layout dst_layout_struct(dst_layout); @@ -1614,6 +1616,12 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Array dst_shape = layout_converter.ForwardShape(src->shape); + Map attrs = { + {"schedule_rule", String(schedule_rule)}, + {"src_layout", String(src_layout)}, + {"dst_layout", String(dst_layout)} + }; + return compute( dst_shape, [&](const Array& dst_indices) { @@ -1625,7 +1633,7 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, } return if_then_else(in_range, src(src_indices), tvm::cast(src->dtype, PrimExpr(0))); }, - name, tag); + name, tag, attrs); } /*! \brief Utility function for auto_scheduler_layout_transform */ diff --git a/python/test_2.py b/python/test_2.py new file mode 100644 index 000000000000..e0ddc33d1586 --- /dev/null +++ b/python/test_2.py @@ -0,0 +1,71 @@ +import onnx + +import tvm.relay +from tvm import meta_schedule as ms + +mod, params = tvm.relay.frontend.from_onnx( + onnx.load("resnet50-v1-12-int8.onnx"), shape={"data": [1, 3, 224, 224]}, freeze_params=True +) + + +def apply_relay_passes( + mod: tvm.IRModule, +): + """Applies relay passes to the input IRModule. + + :param mod: The input IRModule + :return: The IRModule after all the relays passes have been applied + """ + # N.B. Defer the import so as not to unconditionally require other runtimes. + from tvm import relay, transform + from tvm.relay.op.contrib.dnnl import rewrite_layer_norm + + mod = rewrite_layer_norm(mod) + + passes = [] + + # If the inputs are static, run DynamicToStatic to remove + # any residual dynamism in the model. + # If the inputs are dynamic, this pass is much more expensive + # and will not remove dynamism from the model, so we skip it. + passes.append(relay.transform.DynamicToStatic()) + + # Infer types prior to the quantization pass below as some + # transforms might need them. + passes.append(relay.transform.InferType()) + + # Transform fake quantized sub-graphs to actual integer ops. + # Should have no effect on graphs without the relevant patterns. + passes.append(relay.transform.FakeQuantizationToInteger()) + + passes.append(relay.transform.FastMath()) + + # Fold constants after FQ2I becuase some weights are stored in FP32. 
+ passes.append(relay.transform.FoldConstant()) + + # Use sequential to solve for dependent passes + seq = transform.Sequential(passes) + + with tvm.transform.PassContext(opt_level=4): + mod = seq(mod) + + mod = relay.transform.InferType()(mod) + # mod["main"] = rewrite(relay.qnn.transform.LayerNormQuantizedRewrite(), mod["main"]) + + return mod + + +mod = apply_relay_passes(mod) + +print(mod) +target = tvm.target.Target("nvidia/geforce-rtx-3070") +work_dir = "resnet_tune" + +db = ms.relay_integration.tune_relay(mod, params, target, work_dir, 1000) + +""" +Works: +Does not: +57de9e7f3d2711582368903ce95f08b91216b7b5 Mon Nov 28 21:28:37 2022 -0800 + +""" diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 4499d5e2266e..93df67ff6b99 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -94,7 +94,7 @@ def compute_strided_set(attrs, inputs, output_type): _reg.register_injective_schedule("strided_set") # layout_transform -_reg.register_injective_schedule("layout_transform") +_reg.register_strategy("layout_transform", strategy.layout_transform_strategy) _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) _reg.register_injective_schedule("auto_scheduler_layout_transform") _reg.register_pattern("auto_scheduler_layout_transform", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 416637c14905..404b5eb424a1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1396,3 +1396,264 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): name="dft.cuda", ) return strategy + +@layout_transform_strategy.register(["cuda", "gpu"]) +def layout_transform_strategy(attrs, inputs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), + wrap_topi_schedule(topi.cuda.schedule_injective), + name="layout_transform.cuda", + ) + return strategy + + +# "meta_schedule.cuda.layout_transform" +import tvm + +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule(sch, block): + # params: input_buffer, output_buffer + params = sch.mod["main"].params + input_buffer = sch.mod["main"].buffer_map[params[0]] + output_buffer = sch.mod["main"].buffer_map[params[1]] + + input_shape = [int(dim) for dim in input_buffer.shape] + output_shape = [int(dim) for dim in output_buffer.shape] + + src_layout = sch.get_sref(block).stmt.annotations["src_layout"] + dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + + import math + from typing import List, Sequence, Tuple + + from tvm.tir.schedule import BlockRV, ExprRV, LoopRV + + def schedule_layout_transform_v4( + sch: tvm.tir.Schedule, + src_layout: str, + dst_layout: str, + input_shape: List[int], + tile_size: ExprRV, + ): + ## Tiling block_read + # Let N and M represent the dimensions of interest + # N and M are the last dim of src_layout and dst_layout respectively. + # Then the initial block's loop will look like + # [i1, i2 ... M ... j1, j2 ... N] + # + # To guarantee contiguous read for N, we must group reads + # so that loops which factor N, j_n, j_n-1 ... are the innermost dimension + # Therefore our strategy for guaranteeing contiguous writes for N is by + # continually splitting inner-most dimension in order of N, j_n, j_n-1... + # by factors which divide into tile_size until we have the final tile_size. + # e.g. 
if tile_size = 32. N = 2, j_n = 2, j_n-1 = 4, j_n-2 = 24 + # Then by combining N...j_n-1 we get a factor of 16, by spliiting up j_n-2 into + # two loops of 2 and 12, we can combine with the new loop of factor 2 to get a + # factor of 32. Note things don't divide evenly often time so we may have to pad + # to properly factorize. + # + # Similarly with M, to have contiguous writes we must consider the dst_layout: + # [a1, a2 ... N ... b1, b2 ... M] + # So that loops which factor M, b_m, b_m-1 ... are the innermost dimension + # Note that the dimension b_m, b_m-1 and j_n, j_n-1 may refer to the same dimension! + # By factoring j_n, j_n-1 we may be contributing to the innermost write dimension for + # M. However we note that both reads and writes must ideally have the same amount of + # work per thread for layout transforms so we still must build out to factors up + # to tile_size. + + def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): + """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding""" + l1, l2 = sch.split(loop, [None, requested_size]) + return sch.fuse(l1, l2) + + def pad_dimension_to_factor_of_tile_size( + loop: LoopRV, initial_size: int, tile_size: int = tile_size + ) -> Tuple[LoopRV, int]: + """ + Pads loop of given size until it is divisble into tile_size. + If the given size of the loop is greater than tile size. Do not pad. + + example, loop_size = 5, tile_size = 32. loop_size --> 8 + loop_size = 5, tile_size = 36. loop_size --> 6 + loop_size = 8, tile_size = 32. loop_size --> 8 + loop_size = 33, tile_size = 32. loop_size --> 33 + + Returns padded loopRV and the new size + """ + if tile_size % initial_size == 0: + return loop, int(initial_size) + + if initial_size > tile_size or initial_size == tile_size: + return loop, int(initial_size) + + # if initial_size > tile_size return without change, factor = 1 + size = initial_size + while (tile_size % size) % tile_size > 0: + size += 1 + + return pad_dimension_to_at_least_number(loop, size), int(size) + + def spin_out_factor( + loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int + ) -> Tuple[List[LoopRV], List[int], int]: + """ + Factor out loop extant to reach the requested factor. Updates the schedule in-place. + + E.g. say we want to factors which eventually multiply to 32 (factor_needed). + + Say we have the index we chose is a loop with an extant of 8. + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. + - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops + - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 + - we then split up the loop into extants 1 and 4, moving the 4 to the back + - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + - 8 is larger than 5 so we immediately do the splitting routine. + - the 8 extant loop becomes loops with extants 2 and 5 + - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 + + After updating loop ordering in place, returns the new list of loops, extants, and the + remaining factor needed. 
+ """ + cur_loop = loops[index] + cur_extant = loop_extants[index] + + # Pad loops to divide evenly for factors needed, and split + new_loop, new_size = pad_dimension_to_factor_of_tile_size( + cur_loop, cur_extant, tile_size=factor_needed + ) + + split_factor = min(new_size, factor_needed) + new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) + factor_needed = factor_needed // split_factor + + # update caching + loops[index] = new_loop_split + loops.append(factored_loop) + + loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants.append(split_factor) + + sch.reorder(*loops) + return loops, loop_extants, factor_needed + + def factor_dim_in_order( + indices: Sequence[int], + loops: List[LoopRV], + cur_loop_extants: List[int], + work_needed_inner_loop: int = tile_size, + ): + """TODO""" + for i in indices: + if work_needed_inner_loop == 1: + break + loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( + loops, cur_loop_extants, i, work_needed_inner_loop + ) + return loops, cur_loop_extants + + def get_high_level_loop_structure(block): + """Runs the factorization described above.""" + # index 0 ... rank - 1 will always correspond to original loops + # perhaps after they have been factored. + loops = sch.get_loops(block) + cur_loop_extants = list(input_shape) + + # Factor dim0 tile size and fuse things together + loops, cur_loop_extants = factor_dim_in_order( + range(rank - 1, -1, -1), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + # The factors which multiply to tile_size are now in back of our + # list of loops. However because we added them by traversing the inner + # dimensions, they are actually reversed order to guarantee the best access + # so reorder so reorder before fusing. + loops = loops[:rank] + loops[rank:][::-1] + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] + sch.reorder(*loops) + dim0_loop_tiled = sch.fuse(*loops[rank:]) + loops = loops[:rank] + loops.append(dim0_loop_tiled) + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants.append(tile_size) + + # Same thing with dim1 + # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep + loops, cur_loop_extants = factor_dim_in_order( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + loops = loops[: rank + 1] + loops[rank + 1 :][::-1] + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] + sch.reorder(*loops) + dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) + loops = loops[: rank + 1] + loops.append(dim1_loop_tiled) + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants.append(tile_size) + + rank = len(src_layout) + + # Assume write to output global memory is coalesced + block_write = sch.get_block(name="T_layout_trans", func_name="main") + + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + block_read = sch.cache_read(block_write, 0, "shared") + + # Here we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read) + + loops = sch.get_loops(block_read) + + # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small + # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) + loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) + + # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot + # and we just need to move dim0 in (last dimension of dst). + # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] + # where n >= 2. + dim0_loop_index = src_layout.index(dst_layout[-1]) + dim0_loop = loops.pop(dim0_loop_index) + loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] + sch.reorder(*loops) + + # After this: [outer_loop (block), dim0_tiled, dim1_tiled] + outer_loop = sch.fuse(*loops[:-2]) + + # Now that we have the high level loop structure, we can use reverse_compute_at magic + # To get the proper loop structure for writing! This is also as coalesced as possible + # already. + sch.reverse_compute_at(block_write, outer_loop) + + # Fuse all inner loops for the write into 2 loops, grab inner loops for both read + # and write block which have locality (we will bind these to threadIdx) + fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) + _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) + inner_read_loop = sch.get_loops(block_read)[-2] + + sch.bind(loop=outer_loop, thread_axis="blockIdx.x") + sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") + sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + + # tile_size = sch.sample_categorical([8, 16, 32, 64], [0.25] * 4) + schedule_layout_transform_v4(sch, src_layout, dst_layout, input_shape, 32) + return [sch] diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 733b630fc4da..2b29f5c28954 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -21,12 +21,8 @@ from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func -from tvm.topi.utils import ( - get_const_float, - get_const_int, - get_const_tuple, - get_float_tuple, -) +from tvm.topi.utils import (get_const_float, get_const_int, get_const_tuple, + get_float_tuple) from .. import op as _op @@ -2060,3 +2056,31 @@ def conv2d_backward_weight_strategy(attrs, inputs, out_type, target): "conv2d_backward_weight is currently only supported with cudnn. " "Please run Legalize pass to decompose this op into supported ops." 
) + + +@override_native_generic_func("layout_transform_strategy") +def layout_transform_strategy(attrs, inputs, out_type, target): + """layout transform generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_layout_transform(topi.layout_transform), + wrap_topi_schedule(topi.generic.schedule_injective), + name="layout_transform.generic", + ) + return strategy + + +def wrap_compute_layout_transform(topi_compute, schedule_rule=""): + """Wrap stft compute""" + + def _compute_layout_transform(attrs, inputs, output_type): + return [ + topi_compute( + inputs[0], + attrs.src_layout, + attrs.dst_layout, + schedule_rule, + ) + ] + + return _compute_layout_transform diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index d2f5c9b9c586..829d2d68940d 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -17,12 +17,12 @@ # pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument """cuda batch_matmul operators""" import tvm -from tvm import autotvm -from tvm import te +from tvm import autotvm, te +from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity from tvm.contrib import cublas -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import nn, generic -from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor + +from .. import generic, nn +from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline from .tensor_intrin import dp4a @@ -87,6 +87,7 @@ def schedule_batch_matmul(cfg, outs): s: Schedule The computation schedule for the op. """ + breakpoint() outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 23334da9c25c..e4fe3c583990 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -17,13 +17,13 @@ # pylint: disable=invalid-name,consider-using-enumerate,redefined-outer-name """Injective transformation operators""" from __future__ import absolute_import as _abs + import tvm -from tvm import te -from tvm import topi +from tvm import te, topi from tvm.te import hybrid -from . import cpp -from . import tag -from .utils import within_index, make_idx, const_vector + +from . import cpp, tag +from .utils import const_vector, make_idx, within_index def expand_dims(a, axis, num_newaxis=1): @@ -636,7 +636,7 @@ def tile(a, reps): return cpp.tile(a, reps) -def layout_transform(array, src_layout, dst_layout): +def layout_transform(array, src_layout, dst_layout, schedule_rule="None"): """Transform the layout according to src_layout and dst_layout Parameters @@ -649,8 +649,11 @@ def layout_transform(array, src_layout, dst_layout): dst_layout : str the destination layout. 
+ + schedule_rule : str + the schedule rule to apply if any """ - return cpp.layout_transform(array, src_layout, dst_layout) + return cpp.layout_transform(array, src_layout, dst_layout, schedule_rule) def shape(array, dtype="int32"): diff --git a/python/tvm/topi/x86/new.py b/python/tvm/topi/x86/new.py new file mode 100644 index 000000000000..e611b5a84a51 --- /dev/null +++ b/python/tvm/topi/x86/new.py @@ -0,0 +1,12 @@ +import tensorflow as tf + +indices = tf.constant([[1], [1]]) +updates = tf.constant( + [ + [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + ] +) +shape = tf.constant([4, 4, 4]) +scatter = tf.scatter_nd(indices, updates, shape) +print(scatter) diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 988c73c3f071..e027cfdd5742 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -706,6 +706,11 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s /*provided_regions=*/std::move(provided_regions), /*required_regions=*/std::move(required_regions), /*analyzer=*/analyzer); + + for (int i = 0; i < iter_doms.size(); i++) { + //LOG(WARNING) << i << " " << iter_doms[i].dom << " " << iter_doms[i].bound << std::endl; + } + // Step 6. Create the new scope according to the iteration domain reconstructor.MakeNewLoop(/*insert_position=*/insert_position, /*iter_doms=*/std::move(iter_doms), /*analyzer=*/analyzer, /*preserve_unit_loops=*/preserve_unit_loops); diff --git a/src/topi/transform.cc b/src/topi/transform.cc index 0ea1392e5daf..bbefa19c2055 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -87,7 +87,7 @@ TVM_REGISTER_GLOBAL("topi.split").set_body([](TVMArgs args, TVMRetValue* rv) { }); TVM_REGISTER_GLOBAL("topi.layout_transform").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = layout_transform(args[0], args[1], args[2]); + *rv = layout_transform(args[0], args[1], args[2], args[3]); }); TVM_REGISTER_GLOBAL("topi.take").set_body([](TVMArgs args, TVMRetValue* rv) { From 8748bc819d30b61877d7305ac335b555046562ae Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 1 Mar 2023 15:40:46 -0800 Subject: [PATCH 02/40] Generated all the tile sizes --- python/tvm/relay/op/strategy/cuda.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 404b5eb424a1..b3d7909c614b 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1408,7 +1408,6 @@ def layout_transform_strategy(attrs, inputs, out_type, target): return strategy -# "meta_schedule.cuda.layout_transform" import tvm @tvm.register_func("meta_schedule.cuda.layout_transform") @@ -1551,11 +1550,11 @@ def factor_dim_in_order( ): """TODO""" for i in indices: - if work_needed_inner_loop == 1: - break loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( loops, cur_loop_extants, i, work_needed_inner_loop ) + if work_needed_inner_loop == 1: + break return loops, cur_loop_extants def get_high_level_loop_structure(block): @@ -1654,6 +1653,16 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") - # tile_size = sch.sample_categorical([8, 16, 32, 64], [0.25] * 4) - schedule_layout_transform_v4(sch, src_layout, dst_layout, input_shape, 32) - return [sch] + schedules 
= [] + + # Tile size 2,3,4...64 + # Tile size of 1 does not make sense... + for tile_size in range(2, 65): + cur_sch = sch.copy() + schedule_layout_transform_v4(cur_sch, src_layout, dst_layout, input_shape, tile_size) + schedules.append(cur_sch) + + # Also include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + + return schedules From 54d36255ffc28f676cee0537bc837839bc3f4484 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 12:34:08 -0800 Subject: [PATCH 03/40] is this all you need? --- python/tvm/relay/op/strategy/cuda.py | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index b3d7909c614b..6c600534cd28 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1430,6 +1430,7 @@ def cuda_layout_transform_schedule_rule(sch, block): def schedule_layout_transform_v4( sch: tvm.tir.Schedule, + block_write: BlockRV, src_layout: str, dst_layout: str, input_shape: List[int], @@ -1606,14 +1607,12 @@ def get_high_level_loop_structure(block): rank = len(src_layout) - # Assume write to output global memory is coalesced - block_write = sch.get_block(name="T_layout_trans", func_name="main") - # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be # for i, j, k in T.grid(4, 6, 8): # ... # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write block_read = sch.cache_read(block_write, 0, "shared") # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] @@ -1653,16 +1652,41 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + from collections import deque + def auto_inline(start_block): + # BFS from start block in a chain (no branches) + fringe = deque([start_block]) + visited = set() + while len(fringe) > 0: + cur_block = fringe.popleft() + if cur_block in visited: + continue + else: + visited.add(cur_block) + + consumer_blocks = sch.get_consumers(cur_block) + + if len(consumer_blocks) >= 1: + fringe.extend(consumer_blocks) + sch.compute_inline(cur_block) + else: + # consumer yay! + return cur_block + schedules = [] + + # For each schedule we also want to inline each stage as would be done in normal circumstances + # The block which producers the layout transform block seems + block = auto_inline(block) # Tile size 2,3,4...64 # Tile size of 1 does not make sense... 
for tile_size in range(2, 65): cur_sch = sch.copy() - schedule_layout_transform_v4(cur_sch, src_layout, dst_layout, input_shape, tile_size) + schedule_layout_transform_v4(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) # Also include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - + return schedules From 6ccea68079ea89411b79735eec26d1fe7a92c61b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 15:34:52 -0800 Subject: [PATCH 04/40] linting lint move schedule rule to own file lint p2 layout transform fixings --- include/tvm/topi/transform.h | 10 +- python/test_2.py | 71 ---- .../meta_schedule/schedule/cuda/__init__.py | 2 + .../schedule/cuda/layout_transform.py | 321 ++++++++++++++++++ python/tvm/relay/op/strategy/cuda.py | 285 +--------------- python/tvm/relay/op/strategy/generic.py | 5 +- python/tvm/topi/cuda/batch_matmul.py | 11 +- python/tvm/topi/x86/new.py | 12 - src/tir/schedule/primitive/compute_at.cc | 5 - 9 files changed, 335 insertions(+), 387 deletions(-) delete mode 100644 python/test_2.py create mode 100644 python/tvm/meta_schedule/schedule/cuda/layout_transform.py delete mode 100644 python/tvm/topi/x86/new.py diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 51058485838f..0d0c5d5962ce 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1599,7 +1599,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, const std::string schedule_rule = "None", const std::string name = "T_layout_trans", const std::string tag = kInjective) { - // LOG(FATAL) << "WHATTT"; Layout src_layout_struct(src_layout); Layout dst_layout_struct(dst_layout); @@ -1616,11 +1615,10 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Array dst_shape = layout_converter.ForwardShape(src->shape); - Map attrs = { - {"schedule_rule", String(schedule_rule)}, - {"src_layout", String(src_layout)}, - {"dst_layout", String(dst_layout)} - }; + Map attrs = {{"schedule_rule", String(schedule_rule)}, + // Information about layouts needed for the schedule rule + {"src_layout", String(src_layout)}, + {"dst_layout", String(dst_layout)}}; return compute( dst_shape, diff --git a/python/test_2.py b/python/test_2.py deleted file mode 100644 index e0ddc33d1586..000000000000 --- a/python/test_2.py +++ /dev/null @@ -1,71 +0,0 @@ -import onnx - -import tvm.relay -from tvm import meta_schedule as ms - -mod, params = tvm.relay.frontend.from_onnx( - onnx.load("resnet50-v1-12-int8.onnx"), shape={"data": [1, 3, 224, 224]}, freeze_params=True -) - - -def apply_relay_passes( - mod: tvm.IRModule, -): - """Applies relay passes to the input IRModule. - - :param mod: The input IRModule - :return: The IRModule after all the relays passes have been applied - """ - # N.B. Defer the import so as not to unconditionally require other runtimes. - from tvm import relay, transform - from tvm.relay.op.contrib.dnnl import rewrite_layer_norm - - mod = rewrite_layer_norm(mod) - - passes = [] - - # If the inputs are static, run DynamicToStatic to remove - # any residual dynamism in the model. - # If the inputs are dynamic, this pass is much more expensive - # and will not remove dynamism from the model, so we skip it. - passes.append(relay.transform.DynamicToStatic()) - - # Infer types prior to the quantization pass below as some - # transforms might need them. 
- passes.append(relay.transform.InferType()) - - # Transform fake quantized sub-graphs to actual integer ops. - # Should have no effect on graphs without the relevant patterns. - passes.append(relay.transform.FakeQuantizationToInteger()) - - passes.append(relay.transform.FastMath()) - - # Fold constants after FQ2I becuase some weights are stored in FP32. - passes.append(relay.transform.FoldConstant()) - - # Use sequential to solve for dependent passes - seq = transform.Sequential(passes) - - with tvm.transform.PassContext(opt_level=4): - mod = seq(mod) - - mod = relay.transform.InferType()(mod) - # mod["main"] = rewrite(relay.qnn.transform.LayerNormQuantizedRewrite(), mod["main"]) - - return mod - - -mod = apply_relay_passes(mod) - -print(mod) -target = tvm.target.Target("nvidia/geforce-rtx-3070") -work_dir = "resnet_tune" - -db = ms.relay_integration.tune_relay(mod, params, target, work_dir, 1000) - -""" -Works: -Does not: -57de9e7f3d2711582368903ce95f08b91216b7b5 Mon Nov 28 21:28:37 2022 -0800 - -""" diff --git a/python/tvm/meta_schedule/schedule/cuda/__init__.py b/python/tvm/meta_schedule/schedule/cuda/__init__.py index 937a6e16a91b..ce79d15cc4b4 100644 --- a/python/tvm/meta_schedule/schedule/cuda/__init__.py +++ b/python/tvm/meta_schedule/schedule/cuda/__init__.py @@ -15,3 +15,5 @@ # specific language governing permissions and limitations # under the License. """Per-block schedule rules in MetaSchedule for target key 'cuda'""" + +from . import layout_transform diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py new file mode 100644 index 000000000000..e064121e8514 --- /dev/null +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -0,0 +1,321 @@ +import tvm +from tvm import topi +import math +from typing import List, Sequence, Tuple + +from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +from collections import deque + + +def tile_layout_transform( + sch: tvm.tir.Schedule, + block_write: BlockRV, + src_layout: str, + dst_layout: str, + input_shape: List[int], + tile_size: ExprRV, +): + """ + High level tiling for layout transform block. + """ + + ## Tiling layout transforms: + # Assume we have an input shape of [A, B, C, D] and want to layout transform + # ABCD --> DBAC so the output shape would be [D, B, A, C]. + # + # Consider reading from the input buffer in a cache-friendly fashion on CPU. We would + # expect a loop structure like: + # lAr, lBr, lCr, lDr = T.grid(A, B, C, D) + # + # Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: + # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) + # + # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads + # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some + # small sub-tensor of our input writing and then reading from shared memory. We must now + # construct our submatrix so that reading and writing can both be done with some contiguous + # access in global memory. + # + # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. + # We note that if we deal with a submatrix of shape [32, 32] which corresponds + # to the dimension of our input tensor, then rows of the submatrix are contiguous + # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our + # output vector. 
Therefore, with this tile shape we have opportunity to read + # contiguously in our input tensor and write to shared memory, and write contiguously + # to our output tensor. + # + # The multiple dimensional case has a similar analogue. We want to allocate shared + # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension + # of our shared memory to correspond to contiguous reads from the input tensor and + # the outer dimension to correspond to contiguous writes into the output tensor. + # + # In terms of the loop structure reading from the input tensor, the inner most loops + # of our tile must correspond to the inner most dimensions of the input shape, + # while the outer dimensions correspond to the inner most dimensions of the output shape. + # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` + # chunk of our loop in the shape of interest. + # + # An example is probably best to show this idea: + # Let's say we want a layout transform of ABCD --> DCAB. With shape + # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] + # + # And tile size 32. + # + # Then we initially have a coalesced-read loop pattern of: + # T.grid(1024_a, 2_b, 32_c, 8_d) + # + # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: + # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) + # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) + # + # To obtain an outer tile of 32, we factor from B then A to follow contiguous write + # pattern: + # + # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) + # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) + # + # Which allows us to read a tile with our wanted properties. + # For writing we use the existing analysis infrastructure to generate the proper structure for writing. + + def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): + """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" + l1, l2 = sch.split(loop, [None, requested_size]) + return sch.fuse(l1, l2) + + def pad_dimension_to_factor_of_tile_size( + loop: LoopRV, initial_size: int, tile_size: int = tile_size + ) -> Tuple[LoopRV, int]: + """ + Pads loop of given size until it is divisble into tile_size. + If the given size of the loop is greater than tile size. Do not pad. + + example, loop_size = 5, tile_size = 32. loop_size --> 8 + loop_size = 5, tile_size = 36. loop_size --> 6 + loop_size = 8, tile_size = 32. loop_size --> 8 + loop_size = 33, tile_size = 32. loop_size --> 33 + + Returns padded loopRV and the new size + """ + if tile_size % initial_size == 0: + return loop, int(initial_size) + + if initial_size > tile_size or initial_size == tile_size: + return loop, int(initial_size) + + # if initial_size > tile_size return without change, factor = 1 + size = initial_size + while (tile_size % size) % tile_size > 0: + size += 1 + + return pad_dimension_to_at_least_number(loop, size), int(size) + + def spin_out_factor( + loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int + ) -> Tuple[List[LoopRV], List[int], int]: + """ + Factor out loop dimensions to reach the requested factor. Updates the schedule in-place. + + E.g. say we want to factors which eventually multiply to 32 (factor_needed). + + Say we have the index we chose is a loop with an extant of 8. + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. 
+ - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops + - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 + - we then split up the loop into extants 1 and 4, moving the 4 to the back + - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + - 8 is larger than 5 so we immediately do the splitting routine. + - the 8 extant loop becomes loops with extants 2 and 5 + - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 + + After updating loop ordering in place, returns the new list of loops, extants, and the + remaining factor needed. + """ + cur_loop = loops[index] + cur_extant = loop_extants[index] + + # Pad loops to divide evenly for factors needed, and split + new_loop, new_size = pad_dimension_to_factor_of_tile_size( + cur_loop, cur_extant, tile_size=factor_needed + ) + + split_factor = min(new_size, factor_needed) + new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) + factor_needed = factor_needed // split_factor + + # update caching + loops[index] = new_loop_split + loops.append(factored_loop) + + loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants.append(split_factor) + + sch.reorder(*loops) + return loops, loop_extants, factor_needed + + def factor_dim_in_order( + indices: Sequence[int], + loops: List[LoopRV], + cur_loop_extants: List[int], + work_needed_inner_loop: int = tile_size, + ): + """Factors out the loops in the order of indices until we reach needed work. + + Adds new loop factors to the back in reverse order of access. + """ + for i in indices: + loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( + loops, cur_loop_extants, i, work_needed_inner_loop + ) + if work_needed_inner_loop == 1: + break + return loops, cur_loop_extants + + def get_high_level_loop_structure(block): + """Runs the factorization described above.""" + # index 0 ... rank - 1 will always correspond to original loops + # perhaps after they have been factored. + loops = sch.get_loops(block) + cur_loop_extants = list(input_shape) + + # Factor dim0 tile size and fuse things together + loops, cur_loop_extants = factor_dim_in_order( + range(rank - 1, -1, -1), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + # The factors which multiply to tile_size are now in back of our + # list of loops. However because we added them by traversing the inner + # dimensions, they are actually reversed order to guarantee the best access + # so reorder so reorder before fusing. 
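+        # i.e. reverse the appended tile factors so that the factor taken from the
+        # innermost original dimension becomes the innermost loop once they are fused.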
+ loops = loops[:rank] + loops[rank:][::-1] + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] + sch.reorder(*loops) + dim0_loop_tiled = sch.fuse(*loops[rank:]) + loops = loops[:rank] + loops.append(dim0_loop_tiled) + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants.append(tile_size) + + # Same thing with dim1 + # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep + loops, cur_loop_extants = factor_dim_in_order( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + loops = loops[: rank + 1] + loops[rank + 1 :][::-1] + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] + sch.reorder(*loops) + dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) + loops = loops[: rank + 1] + loops.append(dim1_loop_tiled) + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants.append(tile_size) + + rank = len(src_layout) + + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write + block_read = sch.cache_read(block_write, 0, "shared") + + # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read) + loops = sch.get_loops(block_read) + + # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small + # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) + loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) + + # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot + # and we just need to move dim0 in (last dimension of dst). + # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] + # where n >= 2. + dim0_loop_index = src_layout.index(dst_layout[-1]) + dim0_loop = loops.pop(dim0_loop_index) + loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] + sch.reorder(*loops) + + # After this: [outer_loop (block binding), dim0_tiled, dim1_tiled] + outer_loop = sch.fuse(*loops[:-2]) + + # Now that we have the high level loop structure, we can use reverse_compute_at magic + # To get the proper loop structure for writing! This is also as coalesced as possible + # already. + sch.reverse_compute_at(block_write, outer_loop) + + # Fuse all inner loops for the write into 2 loops, grab inner loops for both read + # and write block which have locality (we will bind these to threadIdx) + fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) + _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) + inner_read_loop = sch.get_loops(block_read)[-2] + + sch.bind(loop=outer_loop, thread_axis="blockIdx.x") + sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") + sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + +def auto_inline(start_block): + # Autoinlines given block into consumers, and repeats process for consumer of block + # Done by default for injective schedules. 
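+    # BFS through the chain of consumers: inline every block that still has a consumer
+    # into that consumer, and return the first block found with no consumers (the output block).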
+ fringe = deque([start_block]) + visited = set() + while len(fringe) > 0: + cur_block = fringe.popleft() + if cur_block in visited: + continue + else: + visited.add(cur_block) + + consumer_blocks = sch.get_consumers(cur_block) + if len(consumer_blocks) >= 1: + fringe.extend(consumer_blocks) + sch.compute_inline(cur_block) + else: + # Found output block, no more inlining needed + return cur_block + + +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule(sch, block): + # params: input_buffer, output_buffer + params = sch.mod["main"].params + input_buffer = sch.mod["main"].buffer_map[params[0]] + output_buffer = sch.mod["main"].buffer_map[params[1]] + + # Info needed for tiling + input_shape = [int(dim) for dim in input_buffer.shape] + output_shape = [int(dim) for dim in output_buffer.shape] + src_layout = sch.get_sref(block).stmt.annotations["src_layout"] + dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + block = auto_inline(block) + + schedules = [] + + # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. + for tile_size in range(2, 65): + cur_sch = sch.copy() + tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) + schedules.append(cur_sch) + + # Also include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + + return schedules diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 6c600534cd28..6111ceed0f98 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1397,6 +1397,7 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): ) return strategy + @layout_transform_strategy.register(["cuda", "gpu"]) def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() @@ -1406,287 +1407,3 @@ def layout_transform_strategy(attrs, inputs, out_type, target): name="layout_transform.cuda", ) return strategy - - -import tvm - -@tvm.register_func("meta_schedule.cuda.layout_transform") -def cuda_layout_transform_schedule_rule(sch, block): - # params: input_buffer, output_buffer - params = sch.mod["main"].params - input_buffer = sch.mod["main"].buffer_map[params[0]] - output_buffer = sch.mod["main"].buffer_map[params[1]] - - input_shape = [int(dim) for dim in input_buffer.shape] - output_shape = [int(dim) for dim in output_buffer.shape] - - src_layout = sch.get_sref(block).stmt.annotations["src_layout"] - dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] - - import math - from typing import List, Sequence, Tuple - - from tvm.tir.schedule import BlockRV, ExprRV, LoopRV - - def schedule_layout_transform_v4( - sch: tvm.tir.Schedule, - block_write: BlockRV, - src_layout: str, - dst_layout: str, - input_shape: List[int], - tile_size: ExprRV, - ): - ## Tiling block_read - # Let N and M represent the dimensions of interest - # N and M are the last dim of src_layout and dst_layout respectively. - # Then the initial block's loop will look like - # [i1, i2 ... M ... j1, j2 ... N] - # - # To guarantee contiguous read for N, we must group reads - # so that loops which factor N, j_n, j_n-1 ... are the innermost dimension - # Therefore our strategy for guaranteeing contiguous writes for N is by - # continually splitting inner-most dimension in order of N, j_n, j_n-1... 
- # by factors which divide into tile_size until we have the final tile_size. - # e.g. if tile_size = 32. N = 2, j_n = 2, j_n-1 = 4, j_n-2 = 24 - # Then by combining N...j_n-1 we get a factor of 16, by spliiting up j_n-2 into - # two loops of 2 and 12, we can combine with the new loop of factor 2 to get a - # factor of 32. Note things don't divide evenly often time so we may have to pad - # to properly factorize. - # - # Similarly with M, to have contiguous writes we must consider the dst_layout: - # [a1, a2 ... N ... b1, b2 ... M] - # So that loops which factor M, b_m, b_m-1 ... are the innermost dimension - # Note that the dimension b_m, b_m-1 and j_n, j_n-1 may refer to the same dimension! - # By factoring j_n, j_n-1 we may be contributing to the innermost write dimension for - # M. However we note that both reads and writes must ideally have the same amount of - # work per thread for layout transforms so we still must build out to factors up - # to tile_size. - - def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): - """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding""" - l1, l2 = sch.split(loop, [None, requested_size]) - return sch.fuse(l1, l2) - - def pad_dimension_to_factor_of_tile_size( - loop: LoopRV, initial_size: int, tile_size: int = tile_size - ) -> Tuple[LoopRV, int]: - """ - Pads loop of given size until it is divisble into tile_size. - If the given size of the loop is greater than tile size. Do not pad. - - example, loop_size = 5, tile_size = 32. loop_size --> 8 - loop_size = 5, tile_size = 36. loop_size --> 6 - loop_size = 8, tile_size = 32. loop_size --> 8 - loop_size = 33, tile_size = 32. loop_size --> 33 - - Returns padded loopRV and the new size - """ - if tile_size % initial_size == 0: - return loop, int(initial_size) - - if initial_size > tile_size or initial_size == tile_size: - return loop, int(initial_size) - - # if initial_size > tile_size return without change, factor = 1 - size = initial_size - while (tile_size % size) % tile_size > 0: - size += 1 - - return pad_dimension_to_at_least_number(loop, size), int(size) - - def spin_out_factor( - loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int - ) -> Tuple[List[LoopRV], List[int], int]: - """ - Factor out loop extant to reach the requested factor. Updates the schedule in-place. - - E.g. say we want to factors which eventually multiply to 32 (factor_needed). - - Say we have the index we chose is a loop with an extant of 8. - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 - - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. - - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops - - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 - - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 - - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 - - we then split up the loop into extants 1 and 4, moving the 4 to the back - - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 - - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 - - 8 is larger than 5 so we immediately do the splitting routine. 
- - the 8 extant loop becomes loops with extants 2 and 5 - - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 - - After updating loop ordering in place, returns the new list of loops, extants, and the - remaining factor needed. - """ - cur_loop = loops[index] - cur_extant = loop_extants[index] - - # Pad loops to divide evenly for factors needed, and split - new_loop, new_size = pad_dimension_to_factor_of_tile_size( - cur_loop, cur_extant, tile_size=factor_needed - ) - - split_factor = min(new_size, factor_needed) - new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) - factor_needed = factor_needed // split_factor - - # update caching - loops[index] = new_loop_split - loops.append(factored_loop) - - loop_extants[index] = math.ceil(new_size / split_factor) - loop_extants.append(split_factor) - - sch.reorder(*loops) - return loops, loop_extants, factor_needed - - def factor_dim_in_order( - indices: Sequence[int], - loops: List[LoopRV], - cur_loop_extants: List[int], - work_needed_inner_loop: int = tile_size, - ): - """TODO""" - for i in indices: - loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( - loops, cur_loop_extants, i, work_needed_inner_loop - ) - if work_needed_inner_loop == 1: - break - return loops, cur_loop_extants - - def get_high_level_loop_structure(block): - """Runs the factorization described above.""" - # index 0 ... rank - 1 will always correspond to original loops - # perhaps after they have been factored. - loops = sch.get_loops(block) - cur_loop_extants = list(input_shape) - - # Factor dim0 tile size and fuse things together - loops, cur_loop_extants = factor_dim_in_order( - range(rank - 1, -1, -1), - loops, - cur_loop_extants, - work_needed_inner_loop=tile_size, - ) - # The factors which multiply to tile_size are now in back of our - # list of loops. However because we added them by traversing the inner - # dimensions, they are actually reversed order to guarantee the best access - # so reorder so reorder before fusing. - loops = loops[:rank] + loops[rank:][::-1] - cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] - sch.reorder(*loops) - dim0_loop_tiled = sch.fuse(*loops[rank:]) - loops = loops[:rank] - loops.append(dim0_loop_tiled) - cur_loop_extants = cur_loop_extants[:rank] - cur_loop_extants.append(tile_size) - - # Same thing with dim1 - # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep - loops, cur_loop_extants = factor_dim_in_order( - ( - src_layout.index(dst_layout[loop_index_dst]) - for loop_index_dst in range(rank - 1, -1, -1) - ), - loops, - cur_loop_extants, - work_needed_inner_loop=tile_size, - ) - loops = loops[: rank + 1] + loops[rank + 1 :][::-1] - cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] - sch.reorder(*loops) - dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) - loops = loops[: rank + 1] - loops.append(dim1_loop_tiled) - cur_loop_extants = cur_loop_extants[: rank + 1] - cur_loop_extants.append(tile_size) - - rank = len(src_layout) - - # Outer loop structure of read block matches that of src_layout - # E.g. if input_shape is [4, 6, 8]. Loops for read block will be - # for i, j, k in T.grid(4, 6, 8): - # ... - # Read block will read from global memory coalesced at the start - # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block_write, 0, "shared") - - # Here we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] - get_high_level_loop_structure(block_read) - - loops = sch.get_loops(block_read) - - # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small - # In all likelihood you should use a smaller tile, but I don't want things to crash. - loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) - loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) - - # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot - # and we just need to move dim0 in (last dimension of dst). - # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] - # where n >= 2. - dim0_loop_index = src_layout.index(dst_layout[-1]) - dim0_loop = loops.pop(dim0_loop_index) - loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] - sch.reorder(*loops) - - # After this: [outer_loop (block), dim0_tiled, dim1_tiled] - outer_loop = sch.fuse(*loops[:-2]) - - # Now that we have the high level loop structure, we can use reverse_compute_at magic - # To get the proper loop structure for writing! This is also as coalesced as possible - # already. - sch.reverse_compute_at(block_write, outer_loop) - - # Fuse all inner loops for the write into 2 loops, grab inner loops for both read - # and write block which have locality (we will bind these to threadIdx) - fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) - _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) - inner_read_loop = sch.get_loops(block_read)[-2] - - sch.bind(loop=outer_loop, thread_axis="blockIdx.x") - sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") - sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") - - from collections import deque - def auto_inline(start_block): - # BFS from start block in a chain (no branches) - fringe = deque([start_block]) - visited = set() - while len(fringe) > 0: - cur_block = fringe.popleft() - if cur_block in visited: - continue - else: - visited.add(cur_block) - - consumer_blocks = sch.get_consumers(cur_block) - - if len(consumer_blocks) >= 1: - fringe.extend(consumer_blocks) - sch.compute_inline(cur_block) - else: - # consumer yay! - return cur_block - - schedules = [] - - # For each schedule we also want to inline each stage as would be done in normal circumstances - # The block which producers the layout transform block seems - block = auto_inline(block) - - # Tile size 2,3,4...64 - # Tile size of 1 does not make sense... - for tile_size in range(2, 65): - cur_sch = sch.copy() - schedule_layout_transform_v4(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) - schedules.append(cur_sch) - - # Also include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) - - return schedules diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 2b29f5c28954..d2a189093292 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -21,8 +21,7 @@ from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func -from tvm.topi.utils import (get_const_float, get_const_int, get_const_tuple, - get_float_tuple) +from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple from .. 
import op as _op @@ -2071,7 +2070,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): def wrap_compute_layout_transform(topi_compute, schedule_rule=""): - """Wrap stft compute""" + """Wrap layout transform compute""" def _compute_layout_transform(attrs, inputs, output_type): return [ diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 829d2d68940d..d2f5c9b9c586 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -17,12 +17,12 @@ # pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument """cuda batch_matmul operators""" import tvm -from tvm import autotvm, te -from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity +from tvm import autotvm +from tvm import te from tvm.contrib import cublas - -from .. import generic, nn -from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity +from .. import nn, generic +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor from .tensor_intrin import dp4a @@ -87,7 +87,6 @@ def schedule_batch_matmul(cfg, outs): s: Schedule The computation schedule for the op. """ - breakpoint() outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) diff --git a/python/tvm/topi/x86/new.py b/python/tvm/topi/x86/new.py deleted file mode 100644 index e611b5a84a51..000000000000 --- a/python/tvm/topi/x86/new.py +++ /dev/null @@ -1,12 +0,0 @@ -import tensorflow as tf - -indices = tf.constant([[1], [1]]) -updates = tf.constant( - [ - [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - ] -) -shape = tf.constant([4, 4, 4]) -scatter = tf.scatter_nd(indices, updates, shape) -print(scatter) diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index e027cfdd5742..988c73c3f071 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -706,11 +706,6 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s /*provided_regions=*/std::move(provided_regions), /*required_regions=*/std::move(required_regions), /*analyzer=*/analyzer); - - for (int i = 0; i < iter_doms.size(); i++) { - //LOG(WARNING) << i << " " << iter_doms[i].dom << " " << iter_doms[i].bound << std::endl; - } - // Step 6. 
Create the new scope according to the iteration domain reconstructor.MakeNewLoop(/*insert_position=*/insert_position, /*iter_doms=*/std::move(iter_doms), /*analyzer=*/analyzer, /*preserve_unit_loops=*/preserve_unit_loops); From ced49f4d1019c415e9ad779dcb531eec69330249 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 17:25:56 -0800 Subject: [PATCH 05/40] forgot to forward arg --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index e064121e8514..47413b161619 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -269,7 +269,7 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") -def auto_inline(start_block): +def auto_inline(sch, start_block): # Autoinlines given block into consumers, and repeats process for consumer of block # Done by default for injective schedules. fringe = deque([start_block]) @@ -305,7 +305,7 @@ def cuda_layout_transform_schedule_rule(sch, block): # For each schedule we also want to inline each stage as would be done in normal circumstances # to prevent extraneous memory access. - block = auto_inline(block) + block = auto_inline(sch, block) schedules = [] From c6e87390a117f0e1c21ae9ce3a5bd662f2aa2271 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:09:24 -0800 Subject: [PATCH 06/40] fix tests --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 6 +++--- python/tvm/relay/op/strategy/generic.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 47413b161619..d2e11994f5eb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -309,13 +309,13 @@ def cuda_layout_transform_schedule_rule(sch, block): schedules = [] + # Always include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. 
for tile_size in range(2, 65): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) - # Also include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) - return schedules diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d2a189093292..d53cd045383b 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -2063,7 +2063,8 @@ def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform), - wrap_topi_schedule(topi.generic.schedule_injective), + # Defined earlier in the file + schedule_injective, name="layout_transform.generic", ) return strategy From f18f933180d63f76221ed4d6e5e634362107b830 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:15:59 -0800 Subject: [PATCH 07/40] reduce search space --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index d2e11994f5eb..ac881059f6e2 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -312,8 +312,8 @@ def cuda_layout_transform_schedule_rule(sch, block): # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. - for tile_size in range(2, 65): + # Tile size 2,3,4...32 as tile size of 1 has no coaslescing. + for tile_size in range(2, 33): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) From 5a51ffa078402cc3f938b4c34a546df127d9e1e1 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:22:50 -0800 Subject: [PATCH 08/40] lint --- .../schedule/cuda/layout_transform.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index ac881059f6e2..6d9545474a12 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -18,7 +18,7 @@ def tile_layout_transform( """ High level tiling for layout transform block. """ - + ## Tiling layout transforms: # Assume we have an input shape of [A, B, C, D] and want to layout transform # ABCD --> DBAC so the output shape would be [D, B, A, C]. @@ -31,15 +31,15 @@ def tile_layout_transform( # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) # # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads - # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some - # small sub-tensor of our input writing and then reading from shared memory. We must now - # construct our submatrix so that reading and writing can both be done with some contiguous + # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some + # small sub-tensor of our input writing and then reading from shared memory. 
We must now + # construct our submatrix so that reading and writing can both be done with some contiguous # access in global memory. # # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. - # We note that if we deal with a submatrix of shape [32, 32] which corresponds + # We note that if we deal with a submatrix of shape [32, 32] which corresponds # to the dimension of our input tensor, then rows of the submatrix are contiguous - # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our + # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our # output vector. Therefore, with this tile shape we have opportunity to read # contiguously in our input tensor and write to shared memory, and write contiguously # to our output tensor. @@ -48,13 +48,13 @@ def tile_layout_transform( # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension # of our shared memory to correspond to contiguous reads from the input tensor and # the outer dimension to correspond to contiguous writes into the output tensor. - # + # # In terms of the loop structure reading from the input tensor, the inner most loops - # of our tile must correspond to the inner most dimensions of the input shape, + # of our tile must correspond to the inner most dimensions of the input shape, # while the outer dimensions correspond to the inner most dimensions of the output shape. # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` - # chunk of our loop in the shape of interest. - # + # chunk of our loop in the shape of interest. + # # An example is probably best to show this idea: # Let's say we want a layout transform of ABCD --> DCAB. With shape # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] @@ -63,12 +63,12 @@ def tile_layout_transform( # # Then we initially have a coalesced-read loop pattern of: # T.grid(1024_a, 2_b, 32_c, 8_d) - # + # # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) # - # To obtain an outer tile of 32, we factor from B then A to follow contiguous write + # To obtain an outer tile of 32, we factor from B then A to follow contiguous write # pattern: # # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) @@ -165,7 +165,7 @@ def factor_dim_in_order( work_needed_inner_loop: int = tile_size, ): """Factors out the loops in the order of indices until we reach needed work. - + Adds new loop factors to the back in reverse order of access. """ for i in indices: @@ -269,6 +269,7 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + def auto_inline(sch, start_block): # Autoinlines given block into consumers, and repeats process for consumer of block # Done by default for injective schedules. 
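
For orientation, the binding pattern built above (one fused outer loop on blockIdx.x, a tile-wide loop of both the read and write stages on threadIdx.x) can be reproduced by hand on a plain 2-D transpose. The standalone sketch below is illustrative only: it is not part of the patch, it simplifies the real rule, and it assumes a TVM build from around this series with TensorIR and tvm.script available.

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def transpose(a: T.handle, b: T.handle) -> None:
        A = T.match_buffer(a, (1024, 2048), "float32")
        B = T.match_buffer(b, (2048, 1024), "float32")
        for i, j in T.grid(2048, 1024):
            with T.block("T_layout_trans"):
                vi, vj = T.axis.remap("SS", [i, j])
                B[vi, vj] = A[vj, vi]

    # Illustrative hand-written schedule, not the patch's schedule rule.
    sch = tvm.tir.Schedule(transpose)
    blk = sch.get_block("T_layout_trans")
    shared = sch.cache_read(blk, 0, "shared")      # stage the input through shared memory
    i, j = sch.get_loops(blk)
    io, ii = sch.split(i, factors=[None, 32])
    jo, ji = sch.split(j, factors=[None, 32])
    sch.reorder(io, jo, ii, ji)
    outer = sch.fuse(io, jo)                       # one CUDA block per 32x32 tile
    sch.compute_at(shared, outer)                  # copy just that tile of A per block
    sch.bind(outer, "blockIdx.x")
    sch.bind(sch.get_loops(blk)[-2], "threadIdx.x")     # 32 threads cooperate on the write
    sch.bind(sch.get_loops(shared)[-2], "threadIdx.x")  # the same 32 threads fill the tile
    print(sch.mod)

Each 32x32 tile gets one CUDA block; the shared-memory stage lets the global read follow A's layout while the write follows B's, which is the same trade the rule makes for arbitrary rank.
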
@@ -296,7 +297,7 @@ def cuda_layout_transform_schedule_rule(sch, block): params = sch.mod["main"].params input_buffer = sch.mod["main"].buffer_map[params[0]] output_buffer = sch.mod["main"].buffer_map[params[1]] - + # Info needed for tiling input_shape = [int(dim) for dim in input_buffer.shape] output_shape = [int(dim) for dim in output_buffer.shape] From e1ce901018cb2303e8406ce0da6e7f490da0c037 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 13:42:32 -0800 Subject: [PATCH 09/40] schedule rule documentation --- include/tvm/topi/transform.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 0d0c5d5962ce..ff57f99075ff 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1592,6 +1592,7 @@ inline Array meshgrid(const Array& inputs, const std::string& in * \param dst_layout the destination layout. * \param name output tensor name. * \param tag output tensor tag. + * \param schedule_rule name of specialized schedule rule to use. * \return A tensor with shape in \p dst_layout */ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, From 5f0b1b00f98b9359a8ba06e566d869f931ba0284 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 15:02:46 -0800 Subject: [PATCH 10/40] add a note --- .../postproc/rewrite_cooperative_fetch.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 353b90c36423..f0e7e4eccfa1 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -39,7 +39,17 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins if (thread_axis != axis) { return NullOpt; } - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); + + try { + return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); + } catch (const std::exception& e) { + // This can occur if in a schedule we manually bind threads in the middle of a schedule + // and then later modify the schedule. As the passed in schedule is after running the entire trace + // the bound loop may be moved around in the IRModule. + // TODO: apply trace one inst at a time in the schedule so schedule state is always accurate to instruction + LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + return NullOpt; + } } /*! From 826a8772a7f1933c5c260989a5624a25e4e5f17a Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 15:08:43 -0800 Subject: [PATCH 11/40] fix wording --- src/meta_schedule/postproc/rewrite_cooperative_fetch.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index f0e7e4eccfa1..7e314a50a97b 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -46,7 +46,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins // This can occur if in a schedule we manually bind threads in the middle of a schedule // and then later modify the schedule. As the passed in schedule is after running the entire trace // the bound loop may be moved around in the IRModule. 
- // TODO: apply trace one inst at a time in the schedule so schedule state is always accurate to instruction + // TODO: apply trace one inst at a time so schedule state is always accurate to instruction LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); return NullOpt; } From d11e66e7599a133690333901b939ac7a20c8413b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 13:54:48 -0700 Subject: [PATCH 12/40] handle implicit reshape case v1 --- .../schedule/cuda/layout_transform.py | 117 ++++++++++++++++-- 1 file changed, 106 insertions(+), 11 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 6d9545474a12..84bb696bb088 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,10 +1,10 @@ -import tvm -from tvm import topi import math -from typing import List, Sequence, Tuple +from collections import deque +from typing import List, Sequence, Tuple, Union +import tvm +from tvm import topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV -from collections import deque def tile_layout_transform( @@ -176,11 +176,88 @@ def factor_dim_in_order( break return loops, cur_loop_extants - def get_high_level_loop_structure(block): + # Layout transform allows semantics like NCHW --> NCHW4c + # Which involves splitting the original C axis into contiguous 4-element chunks + # This axis is then moved to the end (NCHWc) + # To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) + # And then the layout is just NCcHW --> NCHW4c + # input_shape, src_layout, dst_layout = handle_split_case(sch) + # Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst + def handle_block_implicit_reshape( + block_read, orig_input_shape, orig_src_layout, orig_dst_layout + ) -> Tuple[List[int], str, str]: + # Each loop should match src_layout dimension extants + loops = sch.get_loops(block_read) + + # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + split_dimensions: List[Tuple[int, int]] = [] + + # This is without numeric digits, e.g. 
NCHW4c -> NCHWc + new_dst_layout = [] + + # Use state machine to parse NCHW4c string + split_size = 0 + for char in orig_dst_layout: + if char.isnumeric(): + split_size = split_size * 10 + int(char) + else: + if char.islower(): + # hit axis like 'c', need to find parent axis 'C' in src_layout + src_layout_index = orig_src_layout.index(char.upper()) + split_dimensions.append((src_layout_index, split_size)) + split_size = 0 + new_dst_layout.append(char) + + # Calculate final input shapes, each of these are a single element for unsplit dims + # and tuples for split dims associated with the two new axis + input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] + new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + for src_layout_split_index, split_factor in split_dimensions: + dimension_name = new_src_layout[src_layout_split_index] + new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) + + remain_factor = input_shape[src_layout_split_index] // split_factor + input_shape[src_layout_split_index] = (remain_factor, split_factor) + + sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) + + # Finally to help analyzer, make layout match that of output + def index_map(*loop_indices): + answer = [] + assert len(loop_indices) == len(input_shape) + for dim, input_shape_solved in zip(loop_indices, input_shape): + if isinstance(input_shape_solved, tuple): + (_, split_factor) = input_shape_solved + answer.extend((dim // split_factor, dim % split_factor)) + else: + answer.append(dim) + return tuple(answer) + + sch.transform_layout( + block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True + ) + + # Unpack any tuples introduced via appending + def unpack_list(target_list) -> list: + output = [] + for ele in target_list: + if isinstance(ele, tuple): + output.extend(ele) + else: + output.append(ele) + return output + + new_src_layout = unpack_list(new_src_layout) + new_src_layout = "".join(new_src_layout) + new_dst_layout = "".join(new_dst_layout) + return unpack_list(input_shape), new_src_layout, new_dst_layout + + def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops # perhaps after they have been factored. - loops = sch.get_loops(block) + rank = len(input_shape) + loops = sch.get_loops(block_read) cur_loop_extants = list(input_shape) # Factor dim0 tile size and fuse things together @@ -223,8 +300,6 @@ def get_high_level_loop_structure(block): cur_loop_extants = cur_loop_extants[: rank + 1] cur_loop_extants.append(tile_size) - rank = len(src_layout) - # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be # for i, j, k in T.grid(4, 6, 8): @@ -233,8 +308,13 @@ def get_high_level_loop_structure(block): # Assume write to output global memory is coalesced in block_write block_read = sch.cache_read(block_write, 0, "shared") - # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] - get_high_level_loop_structure(block_read) + # Grab final input shape and src and dst layouts. + input_shape, src_layout, dst_layout = handle_block_implicit_reshape( + block_read, input_shape, src_layout, dst_layout + ) + + # After this we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout) loops = sch.get_loops(block_read) # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small @@ -291,6 +371,18 @@ def auto_inline(sch, start_block): return cur_block +def handle_split_case(sch, input_shape, src_layout, dst_layout): + """ + Handle the case where the layout transform also reshapes some axis + e.g. NCHW --> NCHW4c + (split the original C axis into chunks of 4 contiguous elements, move this axis to end) + + Rewrites the schedule to handle this case and returns the input shape as well as + src/dst layouts to use for the later code. + """ + pass + + @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule(sch, block): # params: input_buffer, output_buffer @@ -313,7 +405,10 @@ def cuda_layout_transform_schedule_rule(sch, block): # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - # Tile size 2,3,4...32 as tile size of 1 has no coaslescing. + # Setup up basic structure of schedule before applying tiling + sch = sch.copy() + + # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. for tile_size in range(2, 33): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) From 0be1be0c2cafc02f83ccdc701e037ca62975b74f Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 14:52:28 -0700 Subject: [PATCH 13/40] clean up comments --- .../schedule/cuda/layout_transform.py | 480 +++++++++++------- 1 file changed, 291 insertions(+), 189 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 84bb696bb088..c3166abea85c 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -6,9 +6,68 @@ from tvm import topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +## Tiling layout transforms: +# Assume we have an input shape of [A, B, C, D] and want to layout transform +# ABCD --> DBAC so the output shape would be [D, B, A, C]. +# +# Consider reading from the input buffer in a cache-friendly fashion on CPU. We would +# expect a loop structure like: +# lAr, lBr, lCr, lDr = T.grid(A, B, C, D) +# +# Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: +# lDw, lBw, lAw, lCw = T.grid(D, B, A, C) +# +# Clearly in many scenarios it is impossible to guarantee contiguous writes and reads +# within a single loop due to non-adjacent dimensions. Instead we work on transposing some +# small sub-tensor of our input writing and then reading from shared memory. We must now +# construct our submatrix so that reading and writing can both be done with some contiguous +# access in global memory. +# +# Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. +# We note that if we deal with a submatrix of shape [32, 32] which corresponds +# to the dimension of our input tensor, then rows of the submatrix are contiguous +# in the input tensor. Meanwhile, columns of our submatrix are contiguous in our +# output vector. Therefore, with this tile shape we have opportunity to read +# contiguously in our input tensor and write to shared memory, and write contiguously +# to our output tensor. +# +# The multiple dimensional case has a similar analogue. 
We want to allocate shared +# memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension +# of our shared memory to correspond to contiguous reads from the input tensor and +# the outer dimension to correspond to contiguous writes into the output tensor. +# +# In terms of the loop structure reading from the input tensor, the inner most loops +# of our tile must correspond to the inner most dimensions of the input shape, +# while the outer dimensions correspond to the inner most dimensions of the output shape. +# To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` +# chunk of our loop in the shape of interest. +# +# An example is probably best to show this idea: +# Let's say we want a layout transform of ABCD --> DCAB. With shape +# [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] +# +# And tile size 32. +# +# Then we initially have a coalesced-read loop pattern of: +# T.grid(1024_a, 2_b, 32_c, 8_d) +# +# To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: +# T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) +# T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) +# +# To obtain an outer tile of 32, we factor from B then A to follow contiguous write +# pattern: +# +# T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) +# T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) +# +# Which allows us to read a tile with our wanted properties. +# For writing we use the existing analysis infrastructure to generate the proper structure for writing. + def tile_layout_transform( sch: tvm.tir.Schedule, + block_read: BlockRV, block_write: BlockRV, src_layout: str, dst_layout: str, @@ -16,66 +75,38 @@ def tile_layout_transform( tile_size: ExprRV, ): """ - High level tiling for layout transform block. - """ + High level tiling for layout transform block. Mutates sch in place. + + Parameters + ---------- + sch: + The initial schedule. We expect `block_read` and `block_write` to correspond to + the blocks which reads and writes from global memory respectively. We also expect + block_read's initial loops to follow + + block_read: + The block which reads from global memory and writes to shared memory buffer. + + block_write: + The block which writes to global memory and reads from shared memory buffer. + + src_layout : + The src_layout, each character should appear once and also appear in dst_layout. + There should be not numeric characters and refer to potentially implicit reshapes. + E.g. the transform NCHW --> NCHW4c really implies NCcHW --> NCHWc. In this case + src_layout should be NCcHW. - ## Tiling layout transforms: - # Assume we have an input shape of [A, B, C, D] and want to layout transform - # ABCD --> DBAC so the output shape would be [D, B, A, C]. - # - # Consider reading from the input buffer in a cache-friendly fashion on CPU. We would - # expect a loop structure like: - # lAr, lBr, lCr, lDr = T.grid(A, B, C, D) - # - # Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: - # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) - # - # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads - # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some - # small sub-tensor of our input writing and then reading from shared memory. We must now - # construct our submatrix so that reading and writing can both be done with some contiguous - # access in global memory. - # - # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. 
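
The 2-D transpose observation above (rows of a 32x32 input tile are contiguous in the input buffer, while its columns land contiguously in the output) is easy to check numerically. The NumPy snippet below is our own illustration and is separate from the patch:

    import numpy as np

    A = np.arange(1024 * 2048, dtype=np.float32).reshape(1024, 2048)
    B = np.ascontiguousarray(A.T)      # the transform: [1024, 2048] -> [2048, 1024]
    tile = A[0:32, 0:32]               # one 32x32 sub-matrix of the input

    # Rows of the tile are contiguous in the input buffer A ...
    assert np.shares_memory(tile[0], A[0, 0:32])
    # ... and columns of the tile become contiguous rows of the output buffer B.
    assert (B[0, 0:32] == tile[:, 0]).all()
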
- # We note that if we deal with a submatrix of shape [32, 32] which corresponds - # to the dimension of our input tensor, then rows of the submatrix are contiguous - # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our - # output vector. Therefore, with this tile shape we have opportunity to read - # contiguously in our input tensor and write to shared memory, and write contiguously - # to our output tensor. - # - # The multiple dimensional case has a similar analogue. We want to allocate shared - # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension - # of our shared memory to correspond to contiguous reads from the input tensor and - # the outer dimension to correspond to contiguous writes into the output tensor. - # - # In terms of the loop structure reading from the input tensor, the inner most loops - # of our tile must correspond to the inner most dimensions of the input shape, - # while the outer dimensions correspond to the inner most dimensions of the output shape. - # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` - # chunk of our loop in the shape of interest. - # - # An example is probably best to show this idea: - # Let's say we want a layout transform of ABCD --> DCAB. With shape - # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] - # - # And tile size 32. - # - # Then we initially have a coalesced-read loop pattern of: - # T.grid(1024_a, 2_b, 32_c, 8_d) - # - # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: - # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) - # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) - # - # To obtain an outer tile of 32, we factor from B then A to follow contiguous write - # pattern: - # - # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) - # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) - # - # Which allows us to read a tile with our wanted properties. - # For writing we use the existing analysis infrastructure to generate the proper structure for writing. + dst_layout: + The dst_layout. There should not be numeric characters, e.g. NCHW4c becomes NCHWc. + + input_shape: + The input shape after applying potentially implicit reshapes. Should match the loop + extants corresponding to src_layout. + + tile_size: + The tile size of read and writes. There will be tile_size threads per block, each of which + reads up to tile_size elements. + """ def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" @@ -86,15 +117,16 @@ def pad_dimension_to_factor_of_tile_size( loop: LoopRV, initial_size: int, tile_size: int = tile_size ) -> Tuple[LoopRV, int]: """ - Pads loop of given size until it is divisble into tile_size. + Pads loop of given size until it is divisible into tile_size. If the given size of the loop is greater than tile size. Do not pad. - example, loop_size = 5, tile_size = 32. loop_size --> 8 - loop_size = 5, tile_size = 36. loop_size --> 6 - loop_size = 8, tile_size = 32. loop_size --> 8 - loop_size = 33, tile_size = 32. loop_size --> 33 + examples: + - loop_size = 5 , tile_size = 32. loop_size --> 8 + - loop_size = 5 , tile_size = 36. loop_size --> 6 + - loop_size = 8 , tile_size = 32. loop_size --> 8 : since 8 already divides 32. + - loop_size = 33, tile_size = 32. loop_size --> 33 : since 33 > 32. - Returns padded loopRV and the new size + Returns padded loopRV and the new size. 
""" if tile_size % initial_size == 0: return loop, int(initial_size) @@ -113,22 +145,25 @@ def spin_out_factor( loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int ) -> Tuple[List[LoopRV], List[int], int]: """ - Factor out loop dimensions to reach the requested factor. Updates the schedule in-place. + Factor out the requested loop's dimensions to reach the requested factor and + places the requested factor as the innermost loop. + + Updates the schedule in-place. E.g. say we want to factors which eventually multiply to 32 (factor_needed). Say we have the index we chose is a loop with an extant of 8. - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index=3 (dim=8) - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index=0 (dim=3) - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 - we then split up the loop into extants 1 and 4, moving the 4 to the back - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index=3 (dim=8) - 8 is larger than 5 so we immediately do the splitting routine. - the 8 extant loop becomes loops with extants 2 and 5 - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 @@ -163,10 +198,11 @@ def factor_dim_in_order( loops: List[LoopRV], cur_loop_extants: List[int], work_needed_inner_loop: int = tile_size, - ): + ) -> Tuple[List[LoopRV], Sequence[int]]: """Factors out the loops in the order of indices until we reach needed work. - Adds new loop factors to the back in reverse order of access. + Adds new loop factors to the back in reverse order of access. Returns new list + of loops and their extants. """ for i in indices: loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( @@ -176,83 +212,9 @@ def factor_dim_in_order( break return loops, cur_loop_extants - # Layout transform allows semantics like NCHW --> NCHW4c - # Which involves splitting the original C axis into contiguous 4-element chunks - # This axis is then moved to the end (NCHWc) - # To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) - # And then the layout is just NCcHW --> NCHW4c - # input_shape, src_layout, dst_layout = handle_split_case(sch) - # Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst - def handle_block_implicit_reshape( - block_read, orig_input_shape, orig_src_layout, orig_dst_layout - ) -> Tuple[List[int], str, str]: - # Each loop should match src_layout dimension extants - loops = sch.get_loops(block_read) - - # Figure out split dimensions, entries are (loop index in src_layout, split dimension) - split_dimensions: List[Tuple[int, int]] = [] - - # This is without numeric digits, e.g. 
NCHW4c -> NCHWc - new_dst_layout = [] - - # Use state machine to parse NCHW4c string - split_size = 0 - for char in orig_dst_layout: - if char.isnumeric(): - split_size = split_size * 10 + int(char) - else: - if char.islower(): - # hit axis like 'c', need to find parent axis 'C' in src_layout - src_layout_index = orig_src_layout.index(char.upper()) - split_dimensions.append((src_layout_index, split_size)) - split_size = 0 - new_dst_layout.append(char) - - # Calculate final input shapes, each of these are a single element for unsplit dims - # and tuples for split dims associated with the two new axis - input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] - new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] - for src_layout_split_index, split_factor in split_dimensions: - dimension_name = new_src_layout[src_layout_split_index] - new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) - - remain_factor = input_shape[src_layout_split_index] // split_factor - input_shape[src_layout_split_index] = (remain_factor, split_factor) - - sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) - - # Finally to help analyzer, make layout match that of output - def index_map(*loop_indices): - answer = [] - assert len(loop_indices) == len(input_shape) - for dim, input_shape_solved in zip(loop_indices, input_shape): - if isinstance(input_shape_solved, tuple): - (_, split_factor) = input_shape_solved - answer.extend((dim // split_factor, dim % split_factor)) - else: - answer.append(dim) - return tuple(answer) - - sch.transform_layout( - block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True - ) - - # Unpack any tuples introduced via appending - def unpack_list(target_list) -> list: - output = [] - for ele in target_list: - if isinstance(ele, tuple): - output.extend(ele) - else: - output.append(ele) - return output - - new_src_layout = unpack_list(new_src_layout) - new_src_layout = "".join(new_src_layout) - new_dst_layout = "".join(new_dst_layout) - return unpack_list(input_shape), new_src_layout, new_dst_layout - - def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout): + def get_high_level_loop_structure( + block_read: BlockRV, input_shape: Sequence[int], src_layout: str, dst_layout: str + ): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops # perhaps after they have been factored. @@ -270,7 +232,7 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou # The factors which multiply to tile_size are now in back of our # list of loops. However because we added them by traversing the inner # dimensions, they are actually reversed order to guarantee the best access - # so reorder so reorder before fusing. + # so reorder before fusing. loops = loops[:rank] + loops[rank:][::-1] cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] sch.reorder(*loops) @@ -300,25 +262,12 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou cur_loop_extants = cur_loop_extants[: rank + 1] cur_loop_extants.append(tile_size) - # Outer loop structure of read block matches that of src_layout - # E.g. if input_shape is [4, 6, 8]. Loops for read block will be - # for i, j, k in T.grid(4, 6, 8): - # ... 
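
The padding rule behind pad_dimension_to_factor_of_tile_size can be restated in a few lines of plain Python. The sketch below is ours, not the patch's code, and only reproduces the docstring examples:

    def padded_extent(extent: int, tile_size: int) -> int:
        # Grow the extent until it divides tile_size, unless it already exceeds tile_size.
        if extent >= tile_size:
            return extent
        while tile_size % extent != 0:
            extent += 1
        return extent

    assert padded_extent(5, 32) == 8    # pad 5 -> 8, the smallest factor of 32 that fits
    assert padded_extent(5, 36) == 6    # 6 divides 36
    assert padded_extent(8, 32) == 8    # already a factor of 32, unchanged
    assert padded_extent(33, 32) == 33  # larger than the tile, left alone
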
- # Read block will read from global memory coalesced at the start - # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block_write, 0, "shared") - - # Grab final input shape and src and dst layouts. - input_shape, src_layout, dst_layout = handle_block_implicit_reshape( - block_read, input_shape, src_layout, dst_layout - ) - - # After this we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] + # After this we have loops: [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout) - loops = sch.get_loops(block_read) # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops = sch.get_loops(block_read) loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) @@ -331,7 +280,7 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] sch.reorder(*loops) - # After this: [outer_loop (block binding), dim0_tiled, dim1_tiled] + # After this loops are: [outer_loop (block binding), dim0_tiled, dim1_tiled] outer_loop = sch.fuse(*loops[:-2]) # Now that we have the high level loop structure, we can use reverse_compute_at magic @@ -350,9 +299,137 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") -def auto_inline(sch, start_block): - # Autoinlines given block into consumers, and repeats process for consumer of block - # Done by default for injective schedules. +def handle_block_implicit_reshape( + sch: tvm.tir.Schedule, + block_read: BlockRV, + orig_input_shape: Sequence[int], + orig_src_layout: str, + orig_dst_layout: str, +) -> Tuple[List[int], str, str]: + """ + Makes layout transform schedule applicable to implicit reshape case. + + Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous + 4-element chunks. This axis is then moved to the end (NCHWc). This is guaranteed by the operator to be done without + additional padding. To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) + in src_layout found in block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. + + Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst. + + The returned layout strings will be santized and made compatible. E.g. NCHW --> NCHW4c becomes + NCcHW --> NCHWc. + + Parameters + ---------- + sch: + The initial schedule. We expect `block_read`. We also expect + block_read's initial loops to follow the original input shape. + + block_read: + The block which reads from global memory and writes to shared memory buffer. + + orig_input_shape: + The input shape of the input buffer to the primfunc. + + orig_src_layout: + The original src_layout string. + + orig_dst_layout: + The original dst_layout string. + + Returns + ------- + ret: + A tuple of the new input shape of shared memory buffer, the new src_layout and + new dst_layout string. + """ + # Each loop should match src_layout dimension extants + loops = sch.get_loops(block_read) + + # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + split_dimensions: List[Tuple[int, int]] = [] + + # This is without numeric digits, e.g. 
NCHW4c -> NCHWc + new_dst_layout = [] + + # Use state machine to parse NCHW4c string + split_size = 0 + for char in orig_dst_layout: + if char.isnumeric(): + split_size = split_size * 10 + int(char) + else: + if char.islower(): + # hit axis like 'c', need to find parent axis 'C' in src_layout + src_layout_index = orig_src_layout.index(char.upper()) + split_dimensions.append((src_layout_index, split_size)) + split_size = 0 + new_dst_layout.append(char) + + # Calculate final input shapes, each of these are a single element for unsplit dims + # and tuples for split dims associated with the two new axis + input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] + new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + for src_layout_split_index, split_factor in split_dimensions: + dimension_name = new_src_layout[src_layout_split_index] + new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) + + remain_factor = input_shape[src_layout_split_index] // split_factor + input_shape[src_layout_split_index] = (remain_factor, split_factor) + + sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) + + # Finally to help analyzer, make layout match that of output + def index_map(*loop_indices): + answer = [] + assert len(loop_indices) == len(input_shape) + for dim, input_shape_solved in zip(loop_indices, input_shape): + if isinstance(input_shape_solved, tuple): + (_, split_factor) = input_shape_solved + answer.extend((dim // split_factor, dim % split_factor)) + else: + answer.append(dim) + return tuple(answer) + + sch.transform_layout( + block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True + ) + + # Unpack any tuples introduced via appending + def unpack_list(target_list) -> list: + output = [] + for ele in target_list: + if isinstance(ele, tuple): + output.extend(ele) + else: + output.append(ele) + return output + + new_src_layout = unpack_list(new_src_layout) + new_src_layout = "".join(new_src_layout) + new_dst_layout = "".join(new_dst_layout) + return unpack_list(input_shape), new_src_layout, new_dst_layout + + +def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: + """ + Autoinlines given block into consumers, and repeats process for consumer of block. + + Done by default for injective schedules but must manually be called in new schedule rule. + + Parameters + ---------- + sch: + The initial schedule. + + start_block: + The block to inline, should be a block which reads and writes to global memory, doing + layout transform. + + Returns + ------- + ret: + The new block inlined into it's consumers. + """ fringe = deque([start_block]) visited = set() while len(fringe) > 0: @@ -371,47 +448,72 @@ def auto_inline(sch, start_block): return cur_block -def handle_split_case(sch, input_shape, src_layout, dst_layout): +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule( + sch: tvm.tir.Schedule, block: BlockRV +) -> List[tvm.tir.Schedule]: """ - Handle the case where the layout transform also reshapes some axis - e.g. NCHW --> NCHW4c - (split the original C axis into chunks of 4 contiguous elements, move this axis to end) + Applies tiling scheme to layout transform task (potentially fused with other injective funcs). - Rewrites the schedule to handle this case and returns the input shape as well as - src/dst layouts to use for the later code. 
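
The layout-string rewriting done by handle_block_implicit_reshape is easier to follow on concrete values. The plain-Python sketch below is our own and mirrors only the parsing, not the schedule manipulation:

    def sanitize_layouts(src_layout, dst_layout, input_shape):
        # Parse digits in dst_layout (e.g. the "4" in NCHW4c) and record which src axis they split.
        splits, new_dst, factor = [], [], 0
        for ch in dst_layout:
            if ch.isdigit():
                factor = factor * 10 + int(ch)
            else:
                if ch.islower():
                    splits.append((src_layout.index(ch.upper()), factor))
                    factor = 0
                new_dst.append(ch)
        new_src = [c for c in src_layout]
        new_shape = [[dim] for dim in input_shape]
        for axis, split_factor in splits:
            new_src[axis] = new_src[axis] + new_src[axis].lower()
            new_shape[axis] = [input_shape[axis] // split_factor, split_factor]
        flat_shape = [dim for dims in new_shape for dim in dims]
        return "".join(new_src), "".join(new_dst), flat_shape

    # NCHW --> NCHW4c on a 16-channel input becomes NCcHW --> NCHWc on shape [1, 4, 4, 8, 8].
    print(sanitize_layouts("NCHW", "NCHW4c", [1, 16, 8, 8]))
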
- """ - pass + Returned schedules will be the default schedule, as well as tiled versions with tile_size in + the range of 2,3...32. + Parameters + ---------- + sch: + The initial schedule. -@tvm.register_func("meta_schedule.cuda.layout_transform") -def cuda_layout_transform_schedule_rule(sch, block): + block: + The block corresponding to the layout transform. + Should be a block which reads and writes to global memory, doing layout transform. + + Returns + ------- + ret: + A list of new schedules to try. + """ # params: input_buffer, output_buffer params = sch.mod["main"].params input_buffer = sch.mod["main"].buffer_map[params[0]] - output_buffer = sch.mod["main"].buffer_map[params[1]] # Info needed for tiling input_shape = [int(dim) for dim in input_buffer.shape] - output_shape = [int(dim) for dim in output_buffer.shape] src_layout = sch.get_sref(block).stmt.annotations["src_layout"] dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. - block = auto_inline(sch, block) - schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - - # Setup up basic structure of schedule before applying tiling sch = sch.copy() + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + block = auto_inline(sch, block) + + # Setup up basic structure of schedule of creating read into shared mem, before applying tiling + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write + block_read = sch.cache_read(block, 0, "shared") + + # Handle the case where there is an implicit reshape going on. + # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW + # to NCcHW and then applying the new layout where the extant of c is 4. + # Grab final input shape and src and dst layouts. + input_shape, src_layout, dst_layout = handle_block_implicit_reshape( + sch, block_read, input_shape, src_layout, dst_layout + ) + # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. 
for tile_size in range(2, 33): - cur_sch = sch.copy() - tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) - schedules.append(cur_sch) + new_sch = sch.copy() + tile_layout_transform( + new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size + ) + schedules.append(new_sch) return schedules From f2f5165611ff1d85faac04e8138a568096da761b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 15:20:34 -0700 Subject: [PATCH 14/40] address comments --- .../schedule/cuda/layout_transform.py | 23 ++++++++++++++++--- .../postproc/rewrite_cooperative_fetch.cc | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index c3166abea85c..fe66e49752a5 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -448,6 +448,19 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: return cur_block +def get_max_tile_size() -> int: + """Returns the max tile size. + + This is assuming only threads in a warp can have coalesced accesses. 32 is the default if + no target information can be gotten. + """ + max_tile_size = 32 + cur_target = tvm.target.Target.current() + if cur_target is not None and hasattr(cur_target, "thread_warp_size"): + max_tile_size = int(cur_target.thread_warp_size) + return max_tile_size + + @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( sch: tvm.tir.Schedule, block: BlockRV @@ -456,7 +469,10 @@ def cuda_layout_transform_schedule_rule( Applies tiling scheme to layout transform task (potentially fused with other injective funcs). Returned schedules will be the default schedule, as well as tiled versions with tile_size in - the range of 2,3...32. + the range of 2,3...threads_per_warp. + + This is assuming only threads in a warp can have coalesced accesses. 32 is the default if + no target information can be gotten. Parameters ---------- @@ -508,8 +524,9 @@ def cuda_layout_transform_schedule_rule( sch, block_read, input_shape, src_layout, dst_layout ) - # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. - for tile_size in range(2, 33): + # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. + max_tile_size = get_max_tile_size() + for tile_size in range(2, max_tile_size + 1): new_sch = sch.copy() tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 7e314a50a97b..1cf0d893d3e6 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -47,7 +47,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins // and then later modify the schedule. As the passed in schedule is after running the entire trace // the bound loop may be moved around in the IRModule. // TODO: apply trace one inst at a time so schedule state is always accurate to instruction - LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. 
Error " << e.what(); return NullOpt; } } From 1fb271b5fb5b8ff739db7aa6ac45bcbcf30541dd Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 17:08:45 -0700 Subject: [PATCH 15/40] testing harness --- .../meta_schedule/schedule/cuda/layout_transform.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index fe66e49752a5..829bbff8dfeb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,6 +1,6 @@ import math from collections import deque -from typing import List, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import tvm from tvm import topi @@ -463,7 +463,7 @@ def get_max_tile_size() -> int: @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( - sch: tvm.tir.Schedule, block: BlockRV + sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = None ) -> List[tvm.tir.Schedule]: """ Applies tiling scheme to layout transform task (potentially fused with other injective funcs). @@ -483,6 +483,9 @@ def cuda_layout_transform_schedule_rule( The block corresponding to the layout transform. Should be a block which reads and writes to global memory, doing layout transform. + tile_sizes: + A list of tile sizes to try, overriding normal settings. For testing + Returns ------- ret: @@ -526,7 +529,9 @@ def cuda_layout_transform_schedule_rule( # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. max_tile_size = get_max_tile_size() - for tile_size in range(2, max_tile_size + 1): + if tile_sizes is None: + tile_sizes = range(2, max_tile_size + 1) + for tile_size in tile_sizes: new_sch = sch.copy() tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size From 31ca25a87ac49c9b5ad4c287e6d715c737dab5a4 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 17:31:20 -0700 Subject: [PATCH 16/40] more progress on testing harness --- ...schedule_schedule_cuda_layout_transform.py | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py new file mode 100644 index 000000000000..d842fd4aabff --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -0,0 +1,179 @@ +# Edge Cases: +# 1. Fusion with ops +# 2. Fusion with ops + +# Properties to test for +# 1. Compiling -- compiles well without crashing +# 2. Correctness when running +# 3. 
Autotuning ability + + +import itertools +import random +from typing import List, Optional, Tuple, Union + +import tvm +from tvm import meta_schedule, relay +from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule +from tvm.relay.op import OpPattern + + +# Create unary functions which apply ops with compatible fusion levels +def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) + + +def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) + + +def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) + + +def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) + + +def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE + + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) + + +# Applying the actual layout transform will be different +def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + +# These are the only levels of op which can possibly be fused with layout_transform (which injective) +extra_pattern_level_to_op = { + OpPattern.ELEMWISE: apply_elemwise_clip, + OpPattern.BROADCAST: apply_broadcast_add, + OpPattern.INJECTIVE: apply_injective_concatenate, + OpPattern.COMM_REDUCE: apply_comm_reduce_max, +} + + +def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] +) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info + relay.transform.InferTypeLocal(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) + else: + cur_data = extra_pattern_level_to_op[op_info](cur_data) + + relay.transform.InferTypeLocal(cur_data) + return tvm.IRModule.from_expr(cur_data) + + +def generate_test_case( + input_shape: List[int], + implicit_reshape_info: Optional[Tuple[int, int]], + dtype: str, + num_additional_ops: int, +): + # Create layout transforms + rank = len(input_shape) + src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) + + dst_layout = list(src_layout) + if implicit_reshape_info: + axis_to_reshape, size_new_dim = implicit_reshape_info + cur_dim = dst_layout[axis_to_reshape] + dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" + random.shuffle(dst_layout) + dst_layout = "".join(dst_layout) + + op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) + op_choices.append((src_layout, dst_layout)) + + random.shuffle(op_choices) + return create_relay_module(input_shape, dtype, op_choices) + + +def verify_schedule(sch: tvm.tir.Schedule, tile_sizes: List[int]): + block_layout_transform = sch.get_block("T_layout_trans") + schedules = cuda_layout_transform_schedule_rule(sch, block_layout_transform, tile_sizes) + + assert len(schedules) == len(tile_sizes) + 1 + + # This is the default schedule which does not apply the schedule rule + schedule_baseline = schedules[0] + + # These are the tiled schedules we want to test + schedule_end = schedules[1:] + # TODO + + +def generate_all_test_case( + # Each has ~10k elements + input_shapes: List[List[int]] = [ + [12, 48, 18], + [890, 14], + [10, 12, 2, 5, 3, 3], + ], + implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], + dtypes: List[str] = ["float32", "float16"], + num_additional_ops: int = 5, + tile_sizes: List[int] = [32, 20, 19], + repeats_per_condition=10, +): + for _ in range(repeats_per_condition): + for input_shape, implicit_reshape_info, dtype in itertools.product( + input_shapes, implicit_reshape_conditions, dtypes + ): + # Generate random module of fusable ops + layout transform and extract fused layout transform task + mod = generate_test_case(input_shape, implicit_reshape_info, dtype, num_additional_ops) + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + tvm.target.Target("cuda"), + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # Fused layout transform task + dispatched_mod = task_of_interest.dispatched[0] + base_schedule = tvm.tir.Schedule(dispatched_mod) + print(mod) + verify_schedule(base_schedule, tile_sizes) + + +if __name__ == "__main__": + generate_all_test_case() From 585191b20c38d5baf6bfac69ba3c72c167714edd Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 18:03:08 -0700 Subject: [PATCH 17/40] fix case where shape changes in mod --- include/tvm/topi/transform.h | 3 +- 
.../schedule/cuda/layout_transform.py | 9 ++---- ...schedule_schedule_cuda_layout_transform.py | 30 ++++++++++++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index ff57f99075ff..aa75fb05a067 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1619,7 +1619,8 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Map attrs = {{"schedule_rule", String(schedule_rule)}, // Information about layouts needed for the schedule rule {"src_layout", String(src_layout)}, - {"dst_layout", String(dst_layout)}}; + {"dst_layout", String(dst_layout)}, + {"input_shape", src->shape}}; return compute( dst_shape, diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 829bbff8dfeb..4c9fee2fb921 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -187,7 +187,7 @@ def spin_out_factor( loops[index] = new_loop_split loops.append(factored_loop) - loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants[index] = math.ceil(int(new_size) / int(split_factor)) loop_extants.append(split_factor) sch.reorder(*loops) @@ -491,14 +491,10 @@ def cuda_layout_transform_schedule_rule( ret: A list of new schedules to try. """ - # params: input_buffer, output_buffer - params = sch.mod["main"].params - input_buffer = sch.mod["main"].buffer_map[params[0]] - # Info needed for tiling - input_shape = [int(dim) for dim in input_buffer.shape] src_layout = sch.get_sref(block).stmt.annotations["src_layout"] dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + input_shape = [int(c) for c in sch.get_sref(block).stmt.annotations["input_shape"]] schedules = [] @@ -526,7 +522,6 @@ def cuda_layout_transform_schedule_rule( input_shape, src_layout, dst_layout = handle_block_implicit_reshape( sch, block_read, input_shape, src_layout, dst_layout ) - # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. 
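
To see why a tile size of 1 defeats coalescing: the point of the tile is that a warp's global reads are contiguous along the source layout's innermost dimension and its global writes are contiguous along the destination's innermost dimension, with the reshuffle happening in shared memory; with a single-element tile every access stands alone. A rough, standalone illustration of the effect on memory traffic (assuming, as a simplification, a 32-byte minimum global-memory transaction):

    import math

    def transactions_per_row(tile_size: int, dtype_bytes: int, segment: int = 32) -> int:
        # Whole transactions needed to move one contiguous row of the tile.
        return max(1, math.ceil(tile_size * dtype_bytes / segment))

    print(transactions_per_row(1, 4))    # 1 transaction carrying only 4 useful bytes
    print(transactions_per_row(32, 4))   # 4 transactions carrying 128 useful bytes
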
max_tile_size = get_max_tile_size() if tile_sizes is None: diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index d842fd4aabff..0251a1d1f651 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -108,7 +108,10 @@ def generate_test_case( axis_to_reshape, size_new_dim = implicit_reshape_info cur_dim = dst_layout[axis_to_reshape] dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" + random.shuffle(dst_layout) + while "".join(dst_layout) == src_layout: + random.shuffle(dst_layout) dst_layout = "".join(dst_layout) op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) @@ -141,7 +144,7 @@ def generate_all_test_case( ], implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 5, + num_additional_ops: int = 1, tile_sizes: List[int] = [32, 20, 19], repeats_per_condition=10, ): @@ -176,4 +179,29 @@ def generate_all_test_case( if __name__ == "__main__": + mod = create_relay_module([890, 14], "float32", [("AB", "BA"), 2]) + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + tvm.target.Target("cuda"), + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # # Fused layout transform task + # dispatched_mod = task_of_interest.dispatched[0] + # base_schedule = tvm.tir.Schedule(dispatched_mod) + # verify_schedule(base_schedule, [32, 20, 19]) + + breakpoint() + generate_all_test_case() From 9822f4ce13a072513782a485e667196957158d4c Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 18:15:48 -0700 Subject: [PATCH 18/40] inline after schedule genreation to help analysis --- .../schedule/cuda/layout_transform.py | 13 +++--- ...schedule_schedule_cuda_layout_transform.py | 42 +++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 4c9fee2fb921..e19c888fc387 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -73,7 +73,7 @@ def tile_layout_transform( dst_layout: str, input_shape: List[int], tile_size: ExprRV, -): +) -> Tuple[BlockRV, BlockRV]: """ High level tiling for layout transform block. Mutates sch in place. @@ -298,6 +298,8 @@ def get_high_level_loop_structure( sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + return block_write, block_read + def handle_block_implicit_reshape( sch: tvm.tir.Schedule, @@ -502,10 +504,6 @@ def cuda_layout_transform_schedule_rule( schedules.append(sch) sch = sch.copy() - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. 
- block = auto_inline(sch, block) - # Setup up basic structure of schedule of creating read into shared mem, before applying tiling # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be @@ -528,9 +526,12 @@ def cuda_layout_transform_schedule_rule( tile_sizes = range(2, max_tile_size + 1) for tile_size in tile_sizes: new_sch = sch.copy() - tile_layout_transform( + block_write, block_read = tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size ) + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + auto_inline(new_sch, block_write) schedules.append(new_sch) return schedules diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 0251a1d1f651..281fabbeb5e2 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -179,29 +179,29 @@ def generate_all_test_case( if __name__ == "__main__": - mod = create_relay_module([890, 14], "float32", [("AB", "BA"), 2]) - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - tvm.target.Target("cuda"), - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - - # # Fused layout transform task + # mod = create_relay_module([12, 48, 18], "float32", [("ABC", "B2bAC"), 2]) + # extracted_tasks = meta_schedule.relay_integration.extract_tasks( + # mod, + # tvm.target.Target("cuda"), + # {}, + # pass_config={ + # "relay.backend.use_meta_schedule": True, + # "relay.FuseOps.max_depth": 30, + # "relay.backend.tir_converter": "default", + # }, + # ) + # task_of_interest = None + # for task in extracted_tasks: + # if "layout_transform" in task.task_name: + # task_of_interest = task + # break + # assert task_of_interest is not None + + # # # Fused layout transform task # dispatched_mod = task_of_interest.dispatched[0] # base_schedule = tvm.tir.Schedule(dispatched_mod) - # verify_schedule(base_schedule, [32, 20, 19]) + # verify_schedule(base_schedule, [30, 20, 19]) - breakpoint() + # exit() generate_all_test_case() From e152e394c58d891e46f777ae5623d296b3e6f713 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 14 Mar 2023 13:44:45 -0700 Subject: [PATCH 19/40] proper autoinlining INTO layout transform block to maintain extants --- .../schedule/cuda/layout_transform.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index e19c888fc387..50388b34bca3 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -3,7 +3,7 @@ from typing import List, Optional, Sequence, Tuple, Union import tvm -from tvm import topi +from tvm import meta_schedule, topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV ## Tiling layout transforms: @@ -412,11 +412,9 @@ def unpack_list(target_list) -> list: return unpack_list(input_shape), 
new_src_layout, new_dst_layout -def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: +def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: """ - Autoinlines given block into consumers, and repeats process for consumer of block. - - Done by default for injective schedules but must manually be called in new schedule rule. + Inlines given start_block's consumers and future dependencies into start_block. Parameters ---------- @@ -424,7 +422,7 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: The initial schedule. start_block: - The block to inline, should be a block which reads and writes to global memory, doing + The block to inline into, should be a block which reads and writes to global memory, doing layout transform. Returns @@ -432,7 +430,7 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: ret: The new block inlined into it's consumers. """ - fringe = deque([start_block]) + fringe = deque(sch.get_consumers(start_block)) visited = set() while len(fringe) > 0: cur_block = fringe.popleft() @@ -442,12 +440,17 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: visited.add(cur_block) consumer_blocks = sch.get_consumers(cur_block) - if len(consumer_blocks) >= 1: - fringe.extend(consumer_blocks) - sch.compute_inline(cur_block) - else: - # Found output block, no more inlining needed - return cur_block + fringe.extend(consumer_blocks) + + autoinline_rule = meta_schedule.schedule_rule.AutoInline( + into_producer=True, + into_consumer=False, + inline_const_tensor=True, + disallow_if_then_else=False, + require_injective=False, + require_ordered=False, + ) + sch = autoinline_rule.apply(sch, cur_block)[0] def get_max_tile_size() -> int: @@ -504,6 +507,12 @@ def cuda_layout_transform_schedule_rule( schedules.append(sch) sch = sch.copy() + # Inline consumers of the layout transform into the layout transform block. + # Normally default for injective schedules but must manually be called in new schedule rule + # as we introduce a new block under the custom schedule rule which is not taken into account + # during search space generation. TODO: rectify this. + auto_inline_into(sch, block) + # Setup up basic structure of schedule of creating read into shared mem, before applying tiling # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be @@ -520,18 +529,16 @@ def cuda_layout_transform_schedule_rule( input_shape, src_layout, dst_layout = handle_block_implicit_reshape( sch, block_read, input_shape, src_layout, dst_layout ) + # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. max_tile_size = get_max_tile_size() if tile_sizes is None: tile_sizes = range(2, max_tile_size + 1) for tile_size in tile_sizes: new_sch = sch.copy() - block_write, block_read = tile_layout_transform( + tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size ) - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. 
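
The point of this change is the direction of inlining: rather than pushing the layout-transform block forward into its consumers (which rewrites its loop extents), the consumers are folded back into the layout-transform block so its extents survive for the tiling analysis. A minimal, self-contained sketch of that direction using the schedule primitive on a toy kernel (illustrative only, not code from this patch; the AutoInline rule above applies the same idea across a whole chain of consumers):

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def toy(a: T.handle, c: T.handle):
        A = T.match_buffer(a, (16,), "float32")
        C = T.match_buffer(c, (16,), "float32")
        B = T.alloc_buffer((16,), "float32")
        for i in T.serial(16):
            with T.block("layout_like"):   # stands in for the layout transform block
                vi = T.axis.spatial(16, i)
                B[vi] = A[vi] + T.float32(1)
        for i in T.serial(16):
            with T.block("consumer"):      # stands in for a fused elementwise op
                vi = T.axis.spatial(16, i)
                C[vi] = B[vi] * T.float32(2)

    sch = tvm.tir.Schedule(toy)
    # Fold the consumer into its producer; the producer's loop structure is preserved.
    sch.reverse_compute_inline(sch.get_block("consumer"))
    # sch.mod["main"] now contains a single 16-iteration loop computing (A[i] + 1) * 2.
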
- auto_inline(new_sch, block_write) schedules.append(new_sch) return schedules From 8274d21ba07a0858f15e2eab0615862aac852e16 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 14 Mar 2023 15:32:02 -0700 Subject: [PATCH 20/40] clean up --- .../schedule/cuda/layout_transform.py | 18 +- ...schedule_schedule_cuda_layout_transform.py | 165 ++++++++++++++---- 2 files changed, 144 insertions(+), 39 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 50388b34bca3..5a3637af94cf 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -468,7 +468,7 @@ def get_max_tile_size() -> int: @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( - sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = None + sch: tvm.tir.Schedule, block: BlockRV, testing_tile_sizes: Optional[List[int]] = None ) -> List[tvm.tir.Schedule]: """ Applies tiling scheme to layout transform task (potentially fused with other injective funcs). @@ -488,8 +488,9 @@ def cuda_layout_transform_schedule_rule( The block corresponding to the layout transform. Should be a block which reads and writes to global memory, doing layout transform. - tile_sizes: - A list of tile sizes to try, overriding normal settings. For testing + testing_tile_sizes: + A list of tile sizes to try, overriding normal settings. For testing. None means + ignore. Else overrides normal settings of tile sizes to try. Returns ------- @@ -504,7 +505,8 @@ def cuda_layout_transform_schedule_rule( schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) + if not testing_tile_sizes: + schedules.append(sch) sch = sch.copy() # Inline consumers of the layout transform into the layout transform block. @@ -531,9 +533,11 @@ def cuda_layout_transform_schedule_rule( ) # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. 
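
None of the candidate tile sizes has to divide the loop extents evenly: the tiling helpers pad a loop up to the next multiple of the requested factor and let the predicate introduced by the split mask off the tail. A pure-Python sketch of that bookkeeping with hypothetical extents (the real code does this through sch.split and the loop_extants list):

    import math

    new_size, split_factor = 14, 4                        # hypothetical extent and tile factor
    outer = math.ceil(int(new_size) / int(split_factor))  # 4 outer iterations
    padded_extent = outer * split_factor                  # 16 elements covered, 2 of them padding

The int() casts mirror the earlier fix in spin_out_factor, where the extents may arrive as tvm.tir.IntImm values (for example via the "input_shape" annotation) rather than Python ints; treating that as the motivation is an assumption drawn from the surrounding change.
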
- max_tile_size = get_max_tile_size() - if tile_sizes is None: - tile_sizes = range(2, max_tile_size + 1) + if testing_tile_sizes is None: + tile_sizes = range(2, get_max_tile_size() + 1) + else: + tile_sizes = testing_tile_sizes + for tile_size in tile_sizes: new_sch = sch.copy() tile_layout_transform( diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 281fabbeb5e2..fb4452bc8637 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -12,10 +12,13 @@ import random from typing import List, Optional, Tuple, Union +import numpy as np + import tvm from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern +from tvm.tir.schedule import BlockRV, ExprRV, LoopRV # Create unary functions which apply ops with compatible fusion levels @@ -66,6 +69,29 @@ def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): } +class PatchCustomLayoutTransformScheduleRule: + """Patch the custom layout transform schedule to test only specific tile sizes.""" + + FUNC_NAME = "meta_schedule.cuda.layout_transform" + + def __init__(self, tile_sizes: List[int]) -> None: + self.tile_sizes = tile_sizes + self.old_func = None + + def __enter__(self, *args, **kwargs) -> None: + self.old_func = tvm.get_global_func(self.FUNC_NAME) + + def new_layout_rule( + sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = self.tile_sizes + ) -> List[tvm.tir.Schedule]: + return cuda_layout_transform_schedule_rule(sch, block, tile_sizes) + + tvm.register_func(self.FUNC_NAME, new_layout_rule, override=True) + + def __exit__(self, *args, **kwargs) -> None: + tvm.register_func(self.FUNC_NAME, self.old_func, override=True) + + def create_relay_module( input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] ) -> tvm.IRModule: @@ -98,7 +124,7 @@ def generate_test_case( implicit_reshape_info: Optional[Tuple[int, int]], dtype: str, num_additional_ops: int, -): +) -> tvm.IRModule: # Create layout transforms rank = len(input_shape) src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) @@ -121,18 +147,101 @@ def generate_test_case( return create_relay_module(input_shape, dtype, op_choices) -def verify_schedule(sch: tvm.tir.Schedule, tile_sizes: List[int]): - block_layout_transform = sch.get_block("T_layout_trans") - schedules = cuda_layout_transform_schedule_rule(sch, block_layout_transform, tile_sizes) - - assert len(schedules) == len(tile_sizes) + 1 +def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target +) -> Tuple[tvm.IRModule, tvm.IRModule]: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # Fused layout transform task + relay_mod = task_of_interest.mod + dispatched_mod = task_of_interest.dispatched[0] + return relay_mod, dispatched_mod + + +def run_primfunc( + 
primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] +): + with tvm.transform.PassContext( + config={ + "relay.backend.use_meta_schedule": True, + "relay.backend.use_meta_schedule_dispatch": False, + "relay.FuseOps.max_depth": 30, + }, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + +def verify_layout_transform_task( + relay_mod: tvm.IRModule, + dispatched_mod: tvm.IRModule, + target: tvm.target.Target, + tile_sizes: List[int], +): + """Given a layout transform primfunc, tests the given tile_sizes and verifies output matches.""" + space_generator = meta_schedule.space_generator.PostOrderApply( + sch_rules=meta_schedule.schedule_rule.schedule_rule.create("cuda"), + postprocs=meta_schedule.postproc.postproc.create("cuda"), + mutator_probs=meta_schedule.mutator.mutator.create("cuda"), + ) + device = tvm.cuda(0) + + func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) + input_tensors = [] + for input_type in func_type.arg_types: + orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( + input_type.dtype + ) + input_tensors.append(tvm.nd.array(orig_input_np, device)) + ret_type = func_type.ret_type + + def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: + numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( + ret_type.dtype + ) + return tvm.nd.array(numpy_init, device) + + def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: + # By setting the tile_sizes to search to nothing, the layout transform rule just returns + # the original schedule. + tile_size_input = [] if tile_size is None else [tile_size] + with PatchCustomLayoutTransformScheduleRule(tile_sizes=tile_size_input): + tune_context = meta_schedule.TuneContext( + mod=dispatched_mod, + target=target, + space_generator=space_generator, + search_strategy=meta_schedule.search_strategy.create(), + ) + tune_context.pre_tuning(32) + returned_primfunc = tune_context.generate_measure_candidates()[0].sch.mod + output_tensor = get_output_tensor() + run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) + # print(returned_primfunc) + return output_tensor.numpy() - # This is the default schedule which does not apply the schedule rule - schedule_baseline = schedules[0] + # Passing None, we basically do not apply the custom rule we have created. + ground_truth_np = run_and_get_output(None) + for tile_size in tile_sizes: + experimental_result_np = run_and_get_output(tile_size) - # These are the tiled schedules we want to test - schedule_end = schedules[1:] - # TODO + np.testing.assert_allclose(ground_truth_np, experimental_result_np) def generate_all_test_case( @@ -144,38 +253,30 @@ def generate_all_test_case( ], implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 1, + num_additional_ops: int = 0, tile_sizes: List[int] = [32, 20, 19], repeats_per_condition=10, ): + # Small numbers which should work for nearly every (modern-ish) gpu. 
+ target = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + ) for _ in range(repeats_per_condition): for input_shape, implicit_reshape_info, dtype in itertools.product( input_shapes, implicit_reshape_conditions, dtypes ): # Generate random module of fusable ops + layout transform and extract fused layout transform task - mod = generate_test_case(input_shape, implicit_reshape_info, dtype, num_additional_ops) - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - tvm.target.Target("cuda"), - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, + full_mod = generate_test_case( + input_shape, implicit_reshape_info, dtype, num_additional_ops ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None # Fused layout transform task - dispatched_mod = task_of_interest.dispatched[0] - base_schedule = tvm.tir.Schedule(dispatched_mod) - print(mod) - verify_schedule(base_schedule, tile_sizes) + relay_mod, dispatched_mod = extract_layout_transform_task(full_mod, target) + + print(relay_mod) + verify_layout_transform_task(relay_mod, dispatched_mod, target, tile_sizes) + print("Verified!") + print() if __name__ == "__main__": From 1c7aa1941d6cd0b96a4783c1ba4389c8872d83b1 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 13:03:05 -0700 Subject: [PATCH 21/40] reindex for introducing cache block --- .../schedule/cuda/layout_transform.py | 65 ++++++++++--------- .../tvm/meta_schedule/schedule/cuda/test.py | 25 +++++++ 2 files changed, 59 insertions(+), 31 deletions(-) create mode 100644 python/tvm/meta_schedule/schedule/cuda/test.py diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 5a3637af94cf..aa2378443602 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -301,9 +301,9 @@ def get_high_level_loop_structure( return block_write, block_read -def handle_block_implicit_reshape( +def create_cached_read( sch: tvm.tir.Schedule, - block_read: BlockRV, + block_write: BlockRV, orig_input_shape: Sequence[int], orig_src_layout: str, orig_dst_layout: str, @@ -345,10 +345,7 @@ def handle_block_implicit_reshape( A tuple of the new input shape of shared memory buffer, the new src_layout and new dst_layout string. """ - # Each loop should match src_layout dimension extants - loops = sch.get_loops(block_read) - - # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + # Figure out split dimensions, entries are (loop index in src_layout, split amount) split_dimensions: List[Tuple[int, int]] = [] # This is without numeric digits, e.g. 
NCHW4c -> NCHWc @@ -367,6 +364,11 @@ def handle_block_implicit_reshape( split_size = 0 new_dst_layout.append(char) + # If no splits were detected we are done + if len(split_dimensions) == 0: + block_read = sch.cache_read(block_write, 0, "shared") + return block_read, orig_input_shape, orig_src_layout, orig_dst_layout + # Calculate final input shapes, each of these are a single element for unsplit dims # and tuples for split dims associated with the two new axis input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] @@ -374,27 +376,10 @@ def handle_block_implicit_reshape( for src_layout_split_index, split_factor in split_dimensions: dimension_name = new_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) - - remain_factor = input_shape[src_layout_split_index] // split_factor - input_shape[src_layout_split_index] = (remain_factor, split_factor) - - sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) - - # Finally to help analyzer, make layout match that of output - def index_map(*loop_indices): - answer = [] - assert len(loop_indices) == len(input_shape) - for dim, input_shape_solved in zip(loop_indices, input_shape): - if isinstance(input_shape_solved, tuple): - (_, split_factor) = input_shape_solved - answer.extend((dim // split_factor, dim % split_factor)) - else: - answer.append(dim) - return tuple(answer) - - sch.transform_layout( - block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True - ) + input_shape[src_layout_split_index] = ( + input_shape[src_layout_split_index] // split_factor, + split_factor, + ) # Unpack any tuples introduced via appending def unpack_list(target_list) -> list: @@ -409,7 +394,25 @@ def unpack_list(target_list) -> list: new_src_layout = unpack_list(new_src_layout) new_src_layout = "".join(new_src_layout) new_dst_layout = "".join(new_dst_layout) - return unpack_list(input_shape), new_src_layout, new_dst_layout + + # Write block loop extants match + reindex_map = [new_src_layout.index(dim) for dim in new_dst_layout] + block_read = sch.reindex_cache_read( + block_write, + read_buffer_index=0, + index_map=tvm.tir.IndexMap.from_func( + lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], + ndim=len(new_src_layout), + ), + storage_scope="shared", + ) + + # While the above will have the shared memory buffer match the reshaped input tensor + # the loops still match those of the write/output loop/buffer. Match the src layout instead + loops_read = sch.get_loops(block_read) + sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout)]) + + return block_read, unpack_list(input_shape), new_src_layout, new_dst_layout def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: @@ -522,14 +525,14 @@ def cuda_layout_transform_schedule_rule( # ... # Read block will read from global memory coalesced at the start # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block, 0, "shared") + # block_read = sch.cache_read(block, 0, "shared") # Handle the case where there is an implicit reshape going on. # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW # to NCcHW and then applying the new layout where the extant of c is 4. # Grab final input shape and src and dst layouts. 
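
The reindex map computed above is simply the permutation relating the two layout strings: entry k records where the k-th destination axis lives in the (split) source layout, and that permutation is what the IndexMap hands to reindex_cache_read. A small worked example with concrete layout strings chosen for illustration:

    new_src_layout = "NCcHW"   # layout of the implicitly split input / shared buffer
    new_dst_layout = "NCHWc"   # layout of the output
    reindex_map = [new_src_layout.index(d) for d in new_dst_layout]
    # reindex_map == [0, 1, 3, 4, 2]: for each destination axis, the position of the
    # matching axis in the source layout (N->0, C->1, H->3, W->4, c->2).
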
- input_shape, src_layout, dst_layout = handle_block_implicit_reshape( - sch, block_read, input_shape, src_layout, dst_layout + block_read, input_shape, src_layout, dst_layout = create_cached_read( + sch, block, input_shape, src_layout, dst_layout ) # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. diff --git a/python/tvm/meta_schedule/schedule/cuda/test.py b/python/tvm/meta_schedule/schedule/cuda/test.py new file mode 100644 index 000000000000..33d56361f491 --- /dev/null +++ b/python/tvm/meta_schedule/schedule/cuda/test.py @@ -0,0 +1,25 @@ +# Read step +print("read") +for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): + for ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused in range(3): + for ax0_0_1_fused_0_ax0_0_1_fused_1_fused in range(3): + v0 = ( + ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ax0_0_1_fused_0_ax0_0_1_fused_1_fused + ) * (2) + ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused + print( + f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused} -- index: {v0}" + ) + + +# Write step +print("write") +for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): + for ax0_ax1_fused_0 in range(3): + for ax0_ax1_fused_1 in range(3): + v_ax0 = (ax0_ax1_fused_0 * 3 + ax0_ax1_fused_1) // (4) + v_ax1 = ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ( + ax0_ax1_fused_0 * (3) + ax0_ax1_fused_1 + ) % (4) + print( + f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_ax1_fused_1}-- index: {v_ax0}, {v_ax1}" + ) From e70466477d9021b5e92c0d15a4423d11e9cfc9d3 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:12:07 -0700 Subject: [PATCH 22/40] reorganize testing --- ...schedule_schedule_cuda_layout_transform.py | 512 +++++++++--------- 1 file changed, 265 insertions(+), 247 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index fb4452bc8637..318af4326542 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -10,67 +10,26 @@ import itertools import random -from typing import List, Optional, Tuple, Union +import tempfile +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np +import pytest import tvm +import tvm.testing from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern from tvm.tir.schedule import BlockRV, ExprRV, LoopRV -# Create unary functions which apply ops with compatible fusion levels -def get_random_axis(data: relay.Expr): - rank = len(relay.transform.InferTypeLocal(data).shape) - return random.randint(0, rank - 1) - - -def apply_elemwise_clip(data: relay.Expr, min=0, max=10): - assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE - return relay.clip(data, min, max) - - -def apply_broadcast_add(data: relay.Expr, val_to_add=5): - assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST - type_info = relay.transform.InferTypeLocal(data) - return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) - - -def apply_injective_concatenate(data: relay.Expr, axis=None): - if axis is None: - axis = get_random_axis(data) - assert relay.op.get("concatenate").get_attr("TOpPattern") == 
OpPattern.INJECTIVE - return relay.concatenate([data, data], axis) - - -def apply_comm_reduce_max(data: relay.Expr, axis=None): - if axis is None: - axis = get_random_axis(data) - assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE - - # Do this to maintain dimensions - return relay.add(data, relay.max(data, axis, keepdims=True)) - - -# Applying the actual layout transform will be different -def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): - assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.layout_transform(data, src_layout, dst_layout) - - -# These are the only levels of op which can possibly be fused with layout_transform (which injective) -extra_pattern_level_to_op = { - OpPattern.ELEMWISE: apply_elemwise_clip, - OpPattern.BROADCAST: apply_broadcast_add, - OpPattern.INJECTIVE: apply_injective_concatenate, - OpPattern.COMM_REDUCE: apply_comm_reduce_max, -} - - class PatchCustomLayoutTransformScheduleRule: - """Patch the custom layout transform schedule to test only specific tile sizes.""" + """Patch the custom layout transform schedule to test only specific tile sizes. + + If tile_sizes = [], then returns the default (non-tiled) schedule, otherwise + returns only the schedule with the given tiles. + """ FUNC_NAME = "meta_schedule.cuda.layout_transform" @@ -82,7 +41,9 @@ def __enter__(self, *args, **kwargs) -> None: self.old_func = tvm.get_global_func(self.FUNC_NAME) def new_layout_rule( - sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = self.tile_sizes + sch: tvm.tir.Schedule, + block: BlockRV, + tile_sizes: Optional[List[int]] = self.tile_sizes, ) -> List[tvm.tir.Schedule]: return cuda_layout_transform_schedule_rule(sch, block, tile_sizes) @@ -92,217 +53,274 @@ def __exit__(self, *args, **kwargs) -> None: tvm.register_func(self.FUNC_NAME, self.old_func, override=True) -def create_relay_module( - input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] -) -> tvm.IRModule: - """Create a relay module with the given string of ops. +class TestRandomRelayE2ECorrectness: + """Tests E2E correctness of layout transform schedule. - ops: - Applies the associated operators in order. If an integer, refers to applying - the unary operator from `extra_pattern_level_to_op` map. If a tuple, applies - a layout transform with the given (src_layout, dst_layout) + Randomly generates relay mod with layout transform and fusable ops. Checks the + layout transform task for correctness by comparing against its unscheduled result. 
""" - input_data = relay.var("input", shape=input_shape, dtype=dtype) - cur_data = input_data - for op_info in ops: - # Progressively build type info + # Create unary functions which apply ops with compatible fusion levels to layout transform + @staticmethod + def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) + + @staticmethod + def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) + + @staticmethod + def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) + + @staticmethod + def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = TestRandomRelayE2ECorrectness.get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) + + @staticmethod + def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = TestRandomRelayE2ECorrectness.get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE + + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) + + @staticmethod + def get_map_pattern_level_to_op() -> Dict[OpPattern, Callable]: + # These are the only levels of op which can possibly be fused with layout_transform (which injective) + return { + OpPattern.ELEMWISE: TestRandomRelayE2ECorrectness.apply_elemwise_clip, + OpPattern.BROADCAST: TestRandomRelayE2ECorrectness.apply_broadcast_add, + OpPattern.INJECTIVE: TestRandomRelayE2ECorrectness.apply_injective_concatenate, + OpPattern.COMM_REDUCE: TestRandomRelayE2ECorrectness.apply_comm_reduce_max, + } + + @staticmethod + def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + @staticmethod + def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] + ) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info + relay.transform.InferTypeLocal(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = TestRandomRelayE2ECorrectness.apply_layout_transform( + cur_data, src_layout, dst_layout + ) + else: + cur_data = TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op()[op_info]( + cur_data + ) + relay.transform.InferTypeLocal(cur_data) - if isinstance(op_info, tuple): - # layout transform case - src_layout, dst_layout = op_info - cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) - else: - cur_data = extra_pattern_level_to_op[op_info](cur_data) - - relay.transform.InferTypeLocal(cur_data) - return tvm.IRModule.from_expr(cur_data) - - -def generate_test_case( - input_shape: List[int], - implicit_reshape_info: Optional[Tuple[int, int]], - dtype: str, - num_additional_ops: int, -) -> tvm.IRModule: - # Create layout transforms - rank = len(input_shape) - src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) - - dst_layout = list(src_layout) - if implicit_reshape_info: - axis_to_reshape, size_new_dim = implicit_reshape_info - cur_dim = dst_layout[axis_to_reshape] - dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" - - random.shuffle(dst_layout) - while "".join(dst_layout) == src_layout: - random.shuffle(dst_layout) - dst_layout = "".join(dst_layout) - - op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) - op_choices.append((src_layout, dst_layout)) - - random.shuffle(op_choices) - return create_relay_module(input_shape, dtype, op_choices) - - -def extract_layout_transform_task( - mod: tvm.IRModule, target: tvm.target.Target -) -> Tuple[tvm.IRModule, tvm.IRModule]: - """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - target, - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - - # Fused layout transform task - relay_mod = task_of_interest.mod - dispatched_mod = task_of_interest.dispatched[0] - return relay_mod, dispatched_mod - - -def run_primfunc( - primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] -): - with tvm.transform.PassContext( - config={ - "relay.backend.use_meta_schedule": True, - "relay.backend.use_meta_schedule_dispatch": False, - "relay.FuseOps.max_depth": 30, - }, - opt_level=3, - ): - lib = tvm.build(primfunc_mod, target=target) - lib(*input_tensors) - - -def verify_layout_transform_task( - relay_mod: tvm.IRModule, - dispatched_mod: tvm.IRModule, - target: tvm.target.Target, - tile_sizes: List[int], -): - """Given a layout transform primfunc, tests the given tile_sizes and verifies output matches.""" - space_generator = meta_schedule.space_generator.PostOrderApply( - sch_rules=meta_schedule.schedule_rule.schedule_rule.create("cuda"), - postprocs=meta_schedule.postproc.postproc.create("cuda"), - mutator_probs=meta_schedule.mutator.mutator.create("cuda"), - ) - device = tvm.cuda(0) + 
return tvm.IRModule.from_expr(cur_data) + + @staticmethod + def generate_test_case( + input_shape: List[int], + implicit_reshape_info: Optional[Tuple[int, int]], + dtype: str, + num_additional_ops: int, + ) -> tvm.IRModule: + """Creates a random layout transform module with up to num_additional_ops fused.""" + # Create layout transforms + rank = len(input_shape) + + # src_layout is a string like ABCDEFG... with length as rank + src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) + + # dst_layout is randomly shuffled src_layout, potentially after adding split axis + dst_layout = list(src_layout) + if implicit_reshape_info: + axis_to_reshape, size_new_dim = implicit_reshape_info + cur_dim = dst_layout[axis_to_reshape] + dst_layout[axis_to_reshape] = f"{cur_dim}" + dst_layout.append(f"{size_new_dim}{cur_dim.lower()}") - func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) - input_tensors = [] - for input_type in func_type.arg_types: - orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( - input_type.dtype + random.shuffle(dst_layout) + while "".join(dst_layout) == src_layout: + random.shuffle(dst_layout) + dst_layout = "".join(dst_layout) + + # Randomly sample a list of potentially fusable ops to layout transform + op_order = random.choices( + list(TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op().keys()), + k=num_additional_ops, ) - input_tensors.append(tvm.nd.array(orig_input_np, device)) - ret_type = func_type.ret_type - def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: - numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( - ret_type.dtype + # Append tuple, representing layout transfomr from src --> dst layout + op_order.append((src_layout, dst_layout)) + + random.shuffle(op_order) + return TestRandomRelayE2ECorrectness.create_relay_module(input_shape, dtype, op_order) + + @staticmethod + def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target + ) -> meta_schedule.ExtractedTask: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={"relay.backend.use_meta_schedule": True}, ) - return tvm.nd.array(numpy_init, device) - - def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - # By setting the tile_sizes to search to nothing, the layout transform rule just returns - # the original schedule. 
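
get_primfunc above depends on the context manager temporarily replacing the globally registered schedule-rule packed function and restoring it afterwards. A stripped-down sketch of that override-and-restore pattern, using a throwaway registry name invented for the example:

    import tvm

    NAME = "testing.layout_transform_demo"        # hypothetical name, not used by TVM itself
    tvm.register_func(NAME, lambda x: x + 1)      # original behaviour
    original = tvm.get_global_func(NAME)

    tvm.register_func(NAME, lambda x: x + 100, override=True)  # patch (__enter__)
    print(tvm.get_global_func(NAME)(1))           # 101

    tvm.register_func(NAME, original, override=True)           # restore (__exit__)
    print(tvm.get_global_func(NAME)(1))           # 2
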
- tile_size_input = [] if tile_size is None else [tile_size] - with PatchCustomLayoutTransformScheduleRule(tile_sizes=tile_size_input): - tune_context = meta_schedule.TuneContext( - mod=dispatched_mod, - target=target, - space_generator=space_generator, - search_strategy=meta_schedule.search_strategy.create(), + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + return task_of_interest + + @staticmethod + def run_primfunc( + primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] + ): + """Compile and run the primfunc with the given input tensors.""" + with tvm.transform.PassContext( + config={"relay.backend.use_meta_schedule": True}, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + @staticmethod + def get_primfunc(extracted_task: meta_schedule.ExtractedTask, tile_size: Optional[int]): + with PatchCustomLayoutTransformScheduleRule( + tile_sizes=[] if tile_size is None else [tile_size] + ): + with tempfile.TemporaryDirectory() as tmpdir: + ( + tune_contexts, + _, + ) = meta_schedule.relay_integration.extracted_tasks_to_tune_contexts( + [extracted_task], + tmpdir, + ) + tune_contexts[0].pre_tuning(1) + candidates = tune_contexts[0].generate_measure_candidates() + primfunc = candidates[0].sch.mod["main"] + return primfunc + + @staticmethod + def verify_layout_transform_task( + extracted_task: meta_schedule.ExtractedTask, + target: tvm.target.Target, + tile_sizes: List[int], + ): + """Given a layout transform task, tests the given tile_sizes and verifies output matches.""" + device = tvm.cuda(0) + relay_mod = extracted_task.mod + + # Create and cache inputs + func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) + input_tensors = [] + for input_type in func_type.arg_types: + orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( + input_type.dtype + ) + orig_input_np = np.arange(0, orig_input_np.size, dtype=input_type.dtype).reshape( + orig_input_np.shape + ) + input_tensors.append(tvm.nd.array(orig_input_np, device)) + ret_type = func_type.ret_type + + def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: + numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( + ret_type.dtype + ) + return tvm.nd.array(numpy_init, device) + + def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( + extracted_task, tile_size ) - tune_context.pre_tuning(32) - returned_primfunc = tune_context.generate_measure_candidates()[0].sch.mod output_tensor = get_output_tensor() - run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) - # print(returned_primfunc) + TestRandomRelayE2ECorrectness.run_primfunc( + returned_primfunc, target, [*input_tensors, output_tensor] + ) return output_tensor.numpy() - # Passing None, we basically do not apply the custom rule we have created. 
- ground_truth_np = run_and_get_output(None) - for tile_size in tile_sizes: - experimental_result_np = run_and_get_output(tile_size) - - np.testing.assert_allclose(ground_truth_np, experimental_result_np) - - -def generate_all_test_case( - # Each has ~10k elements - input_shapes: List[List[int]] = [ - [12, 48, 18], - [890, 14], - [10, 12, 2, 5, 3, 3], - ], - implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], - dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 0, - tile_sizes: List[int] = [32, 20, 19], - repeats_per_condition=10, -): - # Small numbers which should work for nearly every (modern-ish) gpu. - target = tvm.target.Target( - "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + # Passing None, we basically do not apply the custom rule we have created + # and instead use the old default schedule which is the ground truth. + ground_truth_np = run_and_get_output(None) + + for tile_size in tile_sizes: + experimental_np = run_and_get_output(tile_size) + np.testing.assert_allclose(ground_truth_np, experimental_np) + + input_shape, implicit_reshape_info, dtype, tile_sizes = tvm.testing.parameters( + *itertools.product( + # InputShapes: Each has ~10k elements, should take single microseconds on modern gpu + [ + [12, 48, 18], + [890, 14], + [10, 12, 2, 5, 3, 3], + ], + # Implicit reshape conditions. + # None is do no implicit reshape, (0, 2) means divide axis 0 in half, e.g. AB --> A2aB + [None, (0, 2), (1, 2)], + # Dtypes to test, should not matter that much + ["float16"], + # Tile sizes to try + [[8, 7]], + ) ) - for _ in range(repeats_per_condition): - for input_shape, implicit_reshape_info, dtype in itertools.product( - input_shapes, implicit_reshape_conditions, dtypes - ): - # Generate random module of fusable ops + layout transform and extract fused layout transform task - full_mod = generate_test_case( - input_shape, implicit_reshape_info, dtype, num_additional_ops - ) - # Fused layout transform task - relay_mod, dispatched_mod = extract_layout_transform_task(full_mod, target) + @tvm.testing.requires_gpu + def test_all_test_case( + self, + input_shape, + implicit_reshape_info, + dtype, + tile_sizes, + # number of non-layout transform ops to include and may be fused + num_additional_ops: int = 5, + ): + """Tests the product of all conditions `repeat_per_condition` times.""" + # Small gpu parameters which should work for nearly every (modern-ish) gpu. 
+ target = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + ) + + # Generate random module of fusable ops + layout transform and extract fused layout transform task + full_mod = self.generate_test_case( + input_shape, implicit_reshape_info, dtype, num_additional_ops + ) - print(relay_mod) - verify_layout_transform_task(relay_mod, dispatched_mod, target, tile_sizes) - print("Verified!") - print() + # Fused layout transform task + extracted_task = self.extract_layout_transform_task(full_mod, target) + print(full_mod) + print(extracted_task.task_name) + self.verify_layout_transform_task(extracted_task, target, tile_sizes) + print("Done!") + print() if __name__ == "__main__": - # mod = create_relay_module([12, 48, 18], "float32", [("ABC", "B2bAC"), 2]) - # extracted_tasks = meta_schedule.relay_integration.extract_tasks( - # mod, - # tvm.target.Target("cuda"), - # {}, - # pass_config={ - # "relay.backend.use_meta_schedule": True, - # "relay.FuseOps.max_depth": 30, - # "relay.backend.tir_converter": "default", - # }, - # ) - # task_of_interest = None - # for task in extracted_tasks: - # if "layout_transform" in task.task_name: - # task_of_interest = task - # break - # assert task_of_interest is not None - - # # # Fused layout transform task - # dispatched_mod = task_of_interest.dispatched[0] - # base_schedule = tvm.tir.Schedule(dispatched_mod) - # verify_schedule(base_schedule, [30, 20, 19]) - - # exit() - - generate_all_test_case() + tvm.testing.main() From 67c2db9aa00e6aa6a99b0b4b4bfee1aa37600ed2 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:23:49 -0700 Subject: [PATCH 23/40] more cleanup --- ...schedule_schedule_cuda_layout_transform.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 318af4326542..4e16fc9c16f7 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -273,22 +273,31 @@ def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: for tile_size in tile_sizes: experimental_np = run_and_get_output(tile_size) np.testing.assert_allclose(ground_truth_np, experimental_np) + assert False - input_shape, implicit_reshape_info, dtype, tile_sizes = tvm.testing.parameters( + ( + input_shape, + implicit_reshape_info, + dtype, + tile_sizes, + num_additional_ops, + ) = tvm.testing.parameters( *itertools.product( - # InputShapes: Each has ~10k elements, should take single microseconds on modern gpu + # input_shape: Each has ~10k elements, should take single microseconds on modern gpu [ [12, 48, 18], [890, 14], [10, 12, 2, 5, 3, 3], ], - # Implicit reshape conditions. + # implicit_reshape_info: Implicit reshape conditions. # None is do no implicit reshape, (0, 2) means divide axis 0 in half, e.g. 
AB --> A2aB [None, (0, 2), (1, 2)], - # Dtypes to test, should not matter that much + # dtype: dtypes to test, should not matter that much ["float16"], - # Tile sizes to try + # tile_sizes: Tile sizes to try [[8, 7]], + # num_additional_ops: number of non-layout transform ops to include and may be fused + [5], ) ) @@ -299,8 +308,7 @@ def test_all_test_case( implicit_reshape_info, dtype, tile_sizes, - # number of non-layout transform ops to include and may be fused - num_additional_ops: int = 5, + num_additional_ops, ): """Tests the product of all conditions `repeat_per_condition` times.""" # Small gpu parameters which should work for nearly every (modern-ish) gpu. @@ -315,11 +323,7 @@ def test_all_test_case( # Fused layout transform task extracted_task = self.extract_layout_transform_task(full_mod, target) - print(full_mod) - print(extracted_task.task_name) self.verify_layout_transform_task(extracted_task, target, tile_sizes) - print("Done!") - print() if __name__ == "__main__": From 6e1ea6bc0a47476e2b57f09583be96905009ca40 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:57:46 -0700 Subject: [PATCH 24/40] remove forced false --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 4e16fc9c16f7..4058a7644548 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -273,7 +273,6 @@ def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: for tile_size in tile_sizes: experimental_np = run_and_get_output(tile_size) np.testing.assert_allclose(ground_truth_np, experimental_np) - assert False ( input_shape, From 8eace3073583a8249b87ed6677680e6965104ab5 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 15:27:20 -0700 Subject: [PATCH 25/40] use the proper dispatcher --- python/tvm/relay/op/strategy/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 6111ceed0f98..0c39d12f318d 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1403,7 +1403,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), - wrap_topi_schedule(topi.cuda.schedule_injective), + schedule_injective, name="layout_transform.cuda", ) return strategy From 8eb6f8b108f100664a37df477ff25f785e9e07b4 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 09:49:15 -0700 Subject: [PATCH 26/40] update test, make default schedule rule None --- python/tvm/relay/op/strategy/generic.py | 2 +- tests/python/unittest/test_meta_schedule_relay_integration.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d53cd045383b..2883e5e1fb77 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -2070,7 +2070,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): return strategy -def wrap_compute_layout_transform(topi_compute, schedule_rule=""): +def wrap_compute_layout_transform(topi_compute, 
schedule_rule="None"): """Wrap layout transform compute""" def _compute_layout_transform(attrs, inputs, output_type): diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index f1d74348db17..ee148db94d0a 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -20,6 +20,7 @@ import numpy as np import pytest + import tvm import tvm.testing from tvm import IRModule @@ -420,6 +421,7 @@ def main( # type: ignore ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) T.reads(placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3]) T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4]) + T.block_attr({"dst_layout": "NCHW3c", "input_shape": [1, 3, 16, 16], "schedule_rule": "None", "src_layout": "NCHW"}) T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else( ax0 < T.int64(1) and ax1 * T.int64(3) + ax4 < T.int64(3) and ax2 < T.int64(16) and ax3 < T.int64(16), # type: ignore placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3], @@ -440,6 +442,7 @@ def main(placeholder: T.Buffer((T.int64(1), T.int64(2), T.int64(16), T.int64(16) ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) T.reads(placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)]) # type: ignore T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) + T.block_attr({"dst_layout": "NCHW", "input_shape": [1, 2, 16, 16, 4], "schedule_rule": "None", "src_layout": "NCHW4c"}) T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < T.int64(1) and ax1 < T.int64(8) and ax2 < T.int64(16) and ax3 < T.int64(16), placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)], T.float32(0), dtype="float32") # type: ignore @tvm.script.ir_module From 5546079ce637a93aa0f14ffa688715fdd3d74525 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 10:49:52 -0700 Subject: [PATCH 27/40] linting --- .../schedule/cuda/layout_transform.py | 76 ++++++++++++------- .../tvm/meta_schedule/schedule/cuda/test.py | 25 ------ python/tvm/relay/op/strategy/cuda.py | 2 +- .../postproc/rewrite_cooperative_fetch.cc | 8 +- ...schedule_schedule_cuda_layout_transform.py | 17 +++++ 5 files changed, 71 insertions(+), 57 deletions(-) delete mode 100644 python/tvm/meta_schedule/schedule/cuda/test.py diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index aa2378443602..8cee6ea51162 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,9 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""layout_transform scheduling rule for cuda.""" + import math from collections import deque from typing import List, Optional, Sequence, Tuple, Union import tvm -from tvm import meta_schedule, topi +from tvm import meta_schedule from tvm.tir.schedule import BlockRV, ExprRV, LoopRV ## Tiling layout transforms: @@ -62,7 +80,7 @@ # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) # # Which allows us to read a tile with our wanted properties. -# For writing we use the existing analysis infrastructure to generate the proper structure for writing. +# For writing we use the existing analysis infrastructure to generate the structure for writing. def tile_layout_transform( @@ -110,8 +128,8 @@ def tile_layout_transform( def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" - l1, l2 = sch.split(loop, [None, requested_size]) - return sch.fuse(l1, l2) + left, right = sch.split(loop, [None, requested_size]) + return sch.fuse(left, right) def pad_dimension_to_factor_of_tile_size( loop: LoopRV, initial_size: int, tile_size: int = tile_size @@ -213,7 +231,7 @@ def factor_dim_in_order( return loops, cur_loop_extants def get_high_level_loop_structure( - block_read: BlockRV, input_shape: Sequence[int], src_layout: str, dst_layout: str + block_read: BlockRV, input_shape: List[int], src_layout: str, dst_layout: str ): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops @@ -245,9 +263,11 @@ def get_high_level_loop_structure( # Same thing with dim1 # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep loops, cur_loop_extants = factor_dim_in_order( - ( - src_layout.index(dst_layout[loop_index_dst]) - for loop_index_dst in range(rank - 1, -1, -1) + list( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ) ), loops, cur_loop_extants, @@ -307,14 +327,15 @@ def create_cached_read( orig_input_shape: Sequence[int], orig_src_layout: str, orig_dst_layout: str, -) -> Tuple[List[int], str, str]: +) -> Tuple[BlockRV, List[int], str, str]: """ Makes layout transform schedule applicable to implicit reshape case. - Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous - 4-element chunks. This axis is then moved to the end (NCHWc). This is guaranteed by the operator to be done without - additional padding. To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) - in src_layout found in block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. + Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C + axis into contiguous 4-element chunks. This axis is then moved to the end (NCHWc). This is + guaranteed by the operator to be done without additional padding. To handle this we just split + the associating axis (prev. type checking ensures C is divisible by 4)in src_layout found in + block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst. 
@@ -371,8 +392,8 @@ def create_cached_read( # Calculate final input shapes, each of these are a single element for unsplit dims # and tuples for split dims associated with the two new axis - input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] - new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + input_shape: List[Union[int, Tuple]] = list(orig_input_shape) + new_src_layout: List[Union[str, Tuple]] = list(orig_src_layout) for src_layout_split_index, split_factor in split_dimensions: dimension_name = new_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) @@ -382,8 +403,8 @@ def create_cached_read( ) # Unpack any tuples introduced via appending - def unpack_list(target_list) -> list: - output = [] + def unpack_list(target_list) -> List: + output: List = [] for ele in target_list: if isinstance(ele, tuple): output.extend(ele) @@ -433,26 +454,27 @@ def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: ret: The new block inlined into it's consumers. """ + # Rules defined by DefaultCUDA schedule_rule set. + autoinline_rule = meta_schedule.schedule_rule.AutoInline( + into_producer=True, + into_consumer=False, + inline_const_tensor=True, + disallow_if_then_else=False, + require_injective=False, + require_ordered=False, + ) + fringe = deque(sch.get_consumers(start_block)) visited = set() while len(fringe) > 0: cur_block = fringe.popleft() if cur_block in visited: continue - else: - visited.add(cur_block) + visited.add(cur_block) consumer_blocks = sch.get_consumers(cur_block) fringe.extend(consumer_blocks) - autoinline_rule = meta_schedule.schedule_rule.AutoInline( - into_producer=True, - into_consumer=False, - inline_const_tensor=True, - disallow_if_then_else=False, - require_injective=False, - require_ordered=False, - ) sch = autoinline_rule.apply(sch, cur_block)[0] diff --git a/python/tvm/meta_schedule/schedule/cuda/test.py b/python/tvm/meta_schedule/schedule/cuda/test.py deleted file mode 100644 index 33d56361f491..000000000000 --- a/python/tvm/meta_schedule/schedule/cuda/test.py +++ /dev/null @@ -1,25 +0,0 @@ -# Read step -print("read") -for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): - for ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused in range(3): - for ax0_0_1_fused_0_ax0_0_1_fused_1_fused in range(3): - v0 = ( - ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ax0_0_1_fused_0_ax0_0_1_fused_1_fused - ) * (2) + ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused - print( - f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused} -- index: {v0}" - ) - - -# Write step -print("write") -for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): - for ax0_ax1_fused_0 in range(3): - for ax0_ax1_fused_1 in range(3): - v_ax0 = (ax0_ax1_fused_0 * 3 + ax0_ax1_fused_1) // (4) - v_ax1 = ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ( - ax0_ax1_fused_0 * (3) + ax0_ax1_fused_1 - ) % (4) - print( - f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_ax1_fused_1}-- index: {v_ax0}, {v_ax1}" - ) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 0c39d12f318d..65573321f76c 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1399,7 +1399,7 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): @layout_transform_strategy.register(["cuda", "gpu"]) -def 
layout_transform_strategy(attrs, inputs, out_type, target): +def layout_transform_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 1cf0d893d3e6..8cdb6a1e92a6 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -44,10 +44,10 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } catch (const std::exception& e) { // This can occur if in a schedule we manually bind threads in the middle of a schedule - // and then later modify the schedule. As the passed in schedule is after running the entire trace - // the bound loop may be moved around in the IRModule. - // TODO: apply trace one inst at a time so schedule state is always accurate to instruction - LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + // and then later modify the schedule. As the passed in schedule is after running the entire + // trace the bound loop may be moved around in the IRModule. + LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " + << e.what(); return NullOpt; } } diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 4058a7644548..35dd454b9195 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # Edge Cases: # 1. Fusion with ops # 2. 
Fusion with ops From a3729c9c24ab2b5cc346f3f77d9924664466bf63 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 11:20:21 -0700 Subject: [PATCH 28/40] fix mypy errors --- .../schedule/cuda/layout_transform.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 8cee6ea51162..2c9b87790dbb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -18,7 +18,7 @@ import math from collections import deque -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Tuple, Union import tvm from tvm import meta_schedule @@ -212,11 +212,11 @@ def spin_out_factor( return loops, loop_extants, factor_needed def factor_dim_in_order( - indices: Sequence[int], + indices: List[int], loops: List[LoopRV], cur_loop_extants: List[int], work_needed_inner_loop: int = tile_size, - ) -> Tuple[List[LoopRV], Sequence[int]]: + ) -> Tuple[List[LoopRV], List[int]]: """Factors out the loops in the order of indices until we reach needed work. Adds new loop factors to the back in reverse order of access. Returns new list @@ -242,7 +242,7 @@ def get_high_level_loop_structure( # Factor dim0 tile size and fuse things together loops, cur_loop_extants = factor_dim_in_order( - range(rank - 1, -1, -1), + list(range(rank - 1, -1, -1)), loops, cur_loop_extants, work_needed_inner_loop=tile_size, @@ -324,7 +324,7 @@ def get_high_level_loop_structure( def create_cached_read( sch: tvm.tir.Schedule, block_write: BlockRV, - orig_input_shape: Sequence[int], + orig_input_shape: List[int], orig_src_layout: str, orig_dst_layout: str, ) -> Tuple[BlockRV, List[int], str, str]: @@ -395,10 +395,10 @@ def create_cached_read( input_shape: List[Union[int, Tuple]] = list(orig_input_shape) new_src_layout: List[Union[str, Tuple]] = list(orig_src_layout) for src_layout_split_index, split_factor in split_dimensions: - dimension_name = new_src_layout[src_layout_split_index] + dimension_name = orig_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) input_shape[src_layout_split_index] = ( - input_shape[src_layout_split_index] // split_factor, + orig_input_shape[src_layout_split_index] // split_factor, split_factor, ) @@ -412,18 +412,17 @@ def unpack_list(target_list) -> List: output.append(ele) return output - new_src_layout = unpack_list(new_src_layout) - new_src_layout = "".join(new_src_layout) - new_dst_layout = "".join(new_dst_layout) + new_src_layout_str = "".join(unpack_list(new_src_layout)) + new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - reindex_map = [new_src_layout.index(dim) for dim in new_dst_layout] + reindex_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, index_map=tvm.tir.IndexMap.from_func( lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], - ndim=len(new_src_layout), + ndim=len(new_src_layout_str), ), storage_scope="shared", ) @@ -431,9 +430,9 @@ def unpack_list(target_list) -> List: # While the above will have the shared memory buffer match the reshaped input tensor # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) - sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout)]) + sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) - return block_read, unpack_list(input_shape), new_src_layout, new_dst_layout + return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: @@ -559,7 +558,7 @@ def cuda_layout_transform_schedule_rule( # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. if testing_tile_sizes is None: - tile_sizes = range(2, get_max_tile_size() + 1) + tile_sizes = list(range(2, get_max_tile_size() + 1)) else: tile_sizes = testing_tile_sizes From bd5707745a80faeaf3e99a9a7827ae0f2548f4e6 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 11:28:52 -0700 Subject: [PATCH 29/40] clean up --- .../schedule/cuda/layout_transform.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 2c9b87790dbb..7f0c8a1a7e82 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -124,6 +124,12 @@ def tile_layout_transform( tile_size: The tile size of read and writes. There will be tile_size threads per block, each of which reads up to tile_size elements. + + Returns + ------- + ret: + A tuple of the block that writes to global memory, and the block that reads from + global memory. """ def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): @@ -329,7 +335,11 @@ def create_cached_read( orig_dst_layout: str, ) -> Tuple[BlockRV, List[int], str, str]: """ - Makes layout transform schedule applicable to implicit reshape case. + Creates the cached read block with expected structure. + + Loop extants should follow the input shape closely. E.g. if the input is [2, 6, 8], we + expect our loop structure to be T.grid(2, 6, 8). Possibly reshape to handle implicit reshapes, + in which case we will match the implicit reshape shape. Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous 4-element chunks. This axis is then moved to the end (NCHWc). This is @@ -363,8 +373,8 @@ def create_cached_read( Returns ------- ret: - A tuple of the new input shape of shared memory buffer, the new src_layout and - new dst_layout string. + A tuple of the cached read block, new input shape of shared memory buffer, + the new src_layout, and new dst_layout string. """ # Figure out split dimensions, entries are (loop index in src_layout, split amount) split_dimensions: List[Tuple[int, int]] = [] @@ -431,7 +441,6 @@ def unpack_list(target_list) -> List: # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) - return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str @@ -529,14 +538,15 @@ def cuda_layout_transform_schedule_rule( schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule + # Except during testing if not testing_tile_sizes: schedules.append(sch) + sch = sch.copy() # Inline consumers of the layout transform into the layout transform block. # Normally default for injective schedules but must manually be called in new schedule rule - # as we introduce a new block under the custom schedule rule which is not taken into account - # during search space generation. TODO: rectify this. + # for consumers of the layout transform. TODO(AndrewZhaoLuo): Figure out why this is the case. auto_inline_into(sch, block) # Setup up basic structure of schedule of creating read into shared mem, before applying tiling @@ -546,12 +556,11 @@ def cuda_layout_transform_schedule_rule( # ... # Read block will read from global memory coalesced at the start # Assume write to output global memory is coalesced in block_write - # block_read = sch.cache_read(block, 0, "shared") - - # Handle the case where there is an implicit reshape going on. + # + # This also handles the case where there is an implicit reshape going on. # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW # to NCcHW and then applying the new layout where the extant of c is 4. - # Grab final input shape and src and dst layouts. + # Grab final input shape and src and dst layouts with possible implicit reshape. block_read, input_shape, src_layout, dst_layout = create_cached_read( sch, block, input_shape, src_layout, dst_layout ) From 7eb21ad1b7697f85729eea2e187ce234681a13f0 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:25:07 -0700 Subject: [PATCH 30/40] manual test cases --- .../schedule/cuda/layout_transform.py | 10 +- ...schedule_schedule_cuda_layout_transform.py | 160 +++++++++++++++++- 2 files changed, 157 insertions(+), 13 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 7f0c8a1a7e82..c51c8ed60f77 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -426,21 +426,21 @@ def unpack_list(target_list) -> List: new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - reindex_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] + dst_to_src_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, index_map=tvm.tir.IndexMap.from_func( - lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], + lambda *loops: [loops[dst_to_src_map[i]] for i, _ in enumerate(loops)], ndim=len(new_src_layout_str), ), storage_scope="shared", ) - # While the above will have the shared memory buffer match the reshaped input tensor - # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) - sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) + sch.reorder( + *[loops_read[new_dst_layout_str.index(dst_dim_name)] for dst_dim_name in new_src_layout_str] + ) return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 35dd454b9195..881c959050b0 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -24,7 +24,6 @@ # 2. Correctness when running # 3. Autotuning ability - import itertools import random import tempfile @@ -38,8 +37,15 @@ from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern +from tvm.script import ir as I +from tvm.script import tir as T from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +# Small gpu parameters which should work for nearly every (modern-ish) gpu. +TARGET = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" +) + class PatchCustomLayoutTransformScheduleRule: """Patch the custom layout transform schedule to test only specific tile sizes. @@ -327,19 +333,157 @@ def test_all_test_case( num_additional_ops, ): """Tests the product of all conditions `repeat_per_condition` times.""" - # Small gpu parameters which should work for nearly every (modern-ish) gpu. - target = tvm.target.Target( - "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" - ) - # Generate random module of fusable ops + layout transform and extract fused layout transform task full_mod = self.generate_test_case( input_shape, implicit_reshape_info, dtype, num_additional_ops ) # Fused layout transform task - extracted_task = self.extract_layout_transform_task(full_mod, target) - self.verify_layout_transform_task(extracted_task, target, tile_sizes) + extracted_task = self.extract_layout_transform_task(full_mod, TARGET) + self.verify_layout_transform_task(extracted_task, TARGET, tile_sizes) + + +@tvm.testing.requires_gpu +class TestManualCases: + def assert_extracted_equals_expected( + self, relay_mod: tvm.IRModule, expected_mod: tvm.IRModule, tile_size: int + ): + extracted_task = TestRandomRelayE2ECorrectness.extract_layout_transform_task( + relay_mod, TARGET + ) + dispatched_mod = extracted_task.dispatched[0] + sch = tvm.tir.Schedule(dispatched_mod) + block = sch.get_block("T_layout_trans") + output_sch = cuda_layout_transform_schedule_rule(sch, block, [tile_size])[0] + assert output_sch.mod.script() == expected_mod.script() + + def ntest_simple_tiling(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16")): + 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16", scope="shared") + for ax0_ax2_ax1_0_ax3_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax1_1_fused_0_ax1_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + v1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + ax1_1_fused_0_ax1_1_fused_1_fused) + v2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4)) + v3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + T.reads(p0[v0, v1, v2, v3]) + T.writes(p0_shared[v0, v1, v2, v3]) + p0_shared[v0, v1, v2, v3] = p0[v0, v1, v2, v3] + for ax0_ax1_fused_0 in range(T.int64(16)): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) // T.int64(16)) + v_ax3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) % T.int64(16)) + T.reads(p0_shared[v_ax0, v_ax3, v_ax1, v_ax2]) + T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3]) + T.block_attr({"dst_layout": "NHWC", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3] = T.if_then_else(v_ax0 < T.int64(1) and v_ax3 < T.int64(32) and v_ax1 < T.int64(32) and v_ax2 < T.int64(32), p0_shared[v_ax0, v_ax3, v_ax1, v_ax2], T.float16(0)) + + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) + + def test_simple_implicit_reshape(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # - an implicit reshape is done (see p0_shared) + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused 
// T.int64(8)) + v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) + T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) + T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + for ax0_ax1_ax2_fused_0 in range(T.int64(16)): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) + v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) + v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) + T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) + T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) + + def test_expected_fusion_post(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c"), OpPattern.BROADCAST] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # - an implicit reshape is done (see p0_shared) + # - an addition is inlined in the final block (p1 input) + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), p1: T.Buffer((), "float16"), T_add: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused // T.int64(8)) + v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + v_ax4 = 
T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) + T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) + T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + for ax0_ax1_ax2_fused_0 in range(T.int64(16)): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) + v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) + v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) + T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], p1[()]) + T.writes(T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) + T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + p1[()] + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) if __name__ == "__main__": From 3d092fe9a53150f269539bd951e4c95b5cd93b73 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:27:48 -0700 Subject: [PATCH 31/40] manual tests --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 881c959050b0..827b9fcaa420 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -357,7 +357,7 @@ def assert_extracted_equals_expected( output_sch = cuda_layout_transform_schedule_rule(sch, block, [tile_size])[0] assert output_sch.mod.script() == expected_mod.script() - def ntest_simple_tiling(self): + def test_simple_tiling(self): mod = TestRandomRelayE2ECorrectness.create_relay_module( [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] ) From 7d4df3f9ece6d310eaa5dd6d732079ff733a1ac7 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:39:40 -0700 Subject: [PATCH 32/40] add comment, fix improper implicit reshape handling --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index c51c8ed60f77..949ef915c9ff 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -352,6 +352,8 @@ def create_cached_read( The returned layout strings will be santized and made compatible. E.g. NCHW --> NCHW4c becomes NCcHW --> NCHWc. + TODO(AndrewZhaoLuo): Investigate using proper memory alignment to avoid bank conflict. 
+ Parameters ---------- sch: @@ -426,7 +428,7 @@ def unpack_list(target_list) -> List: new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - dst_to_src_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] + dst_to_src_map = [new_dst_layout_str.index(dim) for dim in new_src_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, From 37a0e9d791557447de634972eac3642645fd2600 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:39:47 -0700 Subject: [PATCH 33/40] fix --- ...schedule_schedule_cuda_layout_transform.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 827b9fcaa420..7000fbab1d0e 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -414,7 +414,7 @@ class ExpectedModule: def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): T.func_attr({"global_symbol": "main", "tir.noalias": True}) # with T.block("root"): - p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared") for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): @@ -425,8 +425,8 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) - T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) - p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) + p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] for ax0_ax1_ax2_fused_0 in range(T.int64(16)): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): with T.block("T_layout_trans"): @@ -435,10 +435,10 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) - T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 
32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) - T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0)) self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_expected_fusion_post(self): @@ -458,7 +458,7 @@ class ExpectedModule: def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), p1: T.Buffer((), "float16"), T_add: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): T.func_attr({"global_symbol": "main", "tir.noalias": True}) # with T.block("root"): - p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared") for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): @@ -469,8 +469,8 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) - T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) - p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) + p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] for ax0_ax1_ax2_fused_0 in range(T.int64(16)): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): with T.block("T_layout_trans"): @@ -479,10 +479,10 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) - T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], p1[()]) + T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], p1[()]) T.writes(T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) - T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + p1[()] + T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = 
T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0)) + p1[()] self.assert_extracted_equals_expected(mod, ExpectedModule, 16) From 0687e572ced9f1c61161881bb2bb116b9c0b1ea7 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:40:19 -0700 Subject: [PATCH 34/40] remove extra comments --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 7000fbab1d0e..a0d76ef1745f 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -15,15 +15,6 @@ # specific language governing permissions and limitations # under the License. -# Edge Cases: -# 1. Fusion with ops -# 2. Fusion with ops - -# Properties to test for -# 1. Compiling -- compiles well without crashing -# 2. Correctness when running -# 3. Autotuning ability - import itertools import random import tempfile From 8945399c124a61ad8aa6388bfa74579324165d2a Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:51:24 -0700 Subject: [PATCH 35/40] more lints --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index a0d76ef1745f..3f5fba8e6296 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -21,7 +21,6 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np -import pytest import tvm import tvm.testing @@ -30,7 +29,7 @@ from tvm.relay.op import OpPattern from tvm.script import ir as I from tvm.script import tir as T -from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +from tvm.tir.schedule import BlockRV # Small gpu parameters which should work for nearly every (modern-ish) gpu. TARGET = tvm.target.Target( From ceb75488da4eb6d2d65c8b306823ba9e1a1b2eac Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 13:22:02 -0700 Subject: [PATCH 36/40] refactor --- ...schedule_schedule_cuda_layout_transform.py | 230 ++++++++---------- 1 file changed, 107 insertions(+), 123 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 3f5fba8e6296..ec93103f8209 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -66,91 +66,116 @@ def __exit__(self, *args, **kwargs) -> None: tvm.register_func(self.FUNC_NAME, self.old_func, override=True) -class TestRandomRelayE2ECorrectness: - """Tests E2E correctness of layout transform schedule. +# Create unary functions which apply ops with compatible fusion levels to layout transform +def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) - Randomly generates relay mod with layout transform and fusable ops. 
Checks the - layout transform task for correctness by comparing against its unscheduled result. - """ - # Create unary functions which apply ops with compatible fusion levels to layout transform - @staticmethod - def get_random_axis(data: relay.Expr): - rank = len(relay.transform.InferTypeLocal(data).shape) - return random.randint(0, rank - 1) +def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) - @staticmethod - def apply_elemwise_clip(data: relay.Expr, min=0, max=10): - assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE - return relay.clip(data, min, max) - @staticmethod - def apply_broadcast_add(data: relay.Expr, val_to_add=5): - assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST - type_info = relay.transform.InferTypeLocal(data) - return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) +def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) - @staticmethod - def apply_injective_concatenate(data: relay.Expr, axis=None): - if axis is None: - axis = TestRandomRelayE2ECorrectness.get_random_axis(data) - assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.concatenate([data, data], axis) - @staticmethod - def apply_comm_reduce_max(data: relay.Expr, axis=None): - if axis is None: - axis = TestRandomRelayE2ECorrectness.get_random_axis(data) - assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE +def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) - # Do this to maintain dimensions - return relay.add(data, relay.max(data, axis, keepdims=True)) - @staticmethod - def get_map_pattern_level_to_op() -> Dict[OpPattern, Callable]: - # These are the only levels of op which can possibly be fused with layout_transform (which injective) - return { - OpPattern.ELEMWISE: TestRandomRelayE2ECorrectness.apply_elemwise_clip, - OpPattern.BROADCAST: TestRandomRelayE2ECorrectness.apply_broadcast_add, - OpPattern.INJECTIVE: TestRandomRelayE2ECorrectness.apply_injective_concatenate, - OpPattern.COMM_REDUCE: TestRandomRelayE2ECorrectness.apply_comm_reduce_max, - } +def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE - @staticmethod - def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): - assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.layout_transform(data, src_layout, dst_layout) + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) - @staticmethod - def create_relay_module( - input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] - ) -> tvm.IRModule: - """Create a relay module with the given string of ops. - - ops: - Applies the associated operators in order. If an integer, refers to applying - the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies - a layout transform with the given (src_layout, dst_layout) - """ - input_data = relay.var("input", shape=input_shape, dtype=dtype) - - cur_data = input_data - for op_info in ops: - # Progressively build type info - relay.transform.InferTypeLocal(cur_data) - if isinstance(op_info, tuple): - # layout transform case - src_layout, dst_layout = op_info - cur_data = TestRandomRelayE2ECorrectness.apply_layout_transform( - cur_data, src_layout, dst_layout - ) - else: - cur_data = TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op()[op_info]( - cur_data - ) +pattern_level_to_op = { + OpPattern.ELEMWISE: apply_elemwise_clip, + OpPattern.BROADCAST: apply_broadcast_add, + OpPattern.INJECTIVE: apply_injective_concatenate, + OpPattern.COMM_REDUCE: apply_comm_reduce_max, +} + + +def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + +def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] +) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info relay.transform.InferTypeLocal(cur_data) - return tvm.IRModule.from_expr(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) + else: + cur_data = pattern_level_to_op[op_info](cur_data) + + relay.transform.InferTypeLocal(cur_data) + return tvm.IRModule.from_expr(cur_data) + + +def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target +) -> meta_schedule.ExtractedTask: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={"relay.backend.use_meta_schedule": True}, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + return task_of_interest + + +def run_primfunc( + primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] +): + """Compile and run the primfunc with the given input tensors.""" + with tvm.transform.PassContext( + config={"relay.backend.use_meta_schedule": True}, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + +class TestRandomRelayE2ECorrectness: + """Tests E2E correctness of layout transform schedule. + + Randomly generates relay mod with layout transform and fusable ops. Checks the + layout transform task for correctness by comparing against its unscheduled result. 
+ """ @staticmethod def generate_test_case( @@ -181,7 +206,7 @@ def generate_test_case( # Randomly sample a list of potentially fusable ops to layout transform op_order = random.choices( - list(TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op().keys()), + list(pattern_level_to_op.keys()), k=num_additional_ops, ) @@ -189,38 +214,7 @@ def generate_test_case( op_order.append((src_layout, dst_layout)) random.shuffle(op_order) - return TestRandomRelayE2ECorrectness.create_relay_module(input_shape, dtype, op_order) - - @staticmethod - def extract_layout_transform_task( - mod: tvm.IRModule, target: tvm.target.Target - ) -> meta_schedule.ExtractedTask: - """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - target, - {}, - pass_config={"relay.backend.use_meta_schedule": True}, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - return task_of_interest - - @staticmethod - def run_primfunc( - primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] - ): - """Compile and run the primfunc with the given input tensors.""" - with tvm.transform.PassContext( - config={"relay.backend.use_meta_schedule": True}, - opt_level=3, - ): - lib = tvm.build(primfunc_mod, target=target) - lib(*input_tensors) + return create_relay_module(input_shape, dtype, op_order) @staticmethod def get_primfunc(extracted_task: meta_schedule.ExtractedTask, tile_size: Optional[int]): @@ -270,13 +264,9 @@ def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: return tvm.nd.array(numpy_init, device) def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( - extracted_task, tile_size - ) + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc(extracted_task, tile_size) output_tensor = get_output_tensor() - TestRandomRelayE2ECorrectness.run_primfunc( - returned_primfunc, target, [*input_tensors, output_tensor] - ) + run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) return output_tensor.numpy() # Passing None, we basically do not apply the custom rule we have created @@ -329,7 +319,7 @@ def test_all_test_case( ) # Fused layout transform task - extracted_task = self.extract_layout_transform_task(full_mod, TARGET) + extracted_task = extract_layout_transform_task(full_mod, TARGET) self.verify_layout_transform_task(extracted_task, TARGET, tile_sizes) @@ -338,9 +328,7 @@ class TestManualCases: def assert_extracted_equals_expected( self, relay_mod: tvm.IRModule, expected_mod: tvm.IRModule, tile_size: int ): - extracted_task = TestRandomRelayE2ECorrectness.extract_layout_transform_task( - relay_mod, TARGET - ) + extracted_task = extract_layout_transform_task(relay_mod, TARGET) dispatched_mod = extracted_task.dispatched[0] sch = tvm.tir.Schedule(dispatched_mod) block = sch.get_block("T_layout_trans") @@ -348,9 +336,7 @@ def assert_extracted_equals_expected( assert output_sch.mod.script() == expected_mod.script() def test_simple_tiling(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( - [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] - ) + mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NHWC")]) # Main things to notice: # - two blocks each with 16, 16 extents which write/read shared mem @@ -389,9 +375,7 @@ def main(p0: 
T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_simple_implicit_reshape(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( - [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")] - ) + mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")]) # Main things to notice: # - two blocks each with 16, 16 extents which write/read shared mem @@ -432,7 +416,7 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_expected_fusion_post(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( + mod = create_relay_module( [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c"), OpPattern.BROADCAST] ) From 7fdcb690b94634112a9681a3a09b6be0293c3d9c Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 20 Mar 2023 10:56:29 -0700 Subject: [PATCH 37/40] remove extraneous check --- .../postproc/rewrite_cooperative_fetch.cc | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 8cdb6a1e92a6..19682e6c614a 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -40,16 +40,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins return NullOpt; } - try { - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); - } catch (const std::exception& e) { - // This can occur if in a schedule we manually bind threads in the middle of a schedule - // and then later modify the schedule. As the passed in schedule is after running the entire - // trace the bound loop may be moved around in the IRModule. - LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " - << e.what(); - return NullOpt; - } + return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } /*! 
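The implicit-reshape handling that patches 30 through 33 keep adjusting (create_cached_read and dst_to_src_map) comes down to the semantics spelled out in the create_cached_read docstring: NCHW -> NCHW4c splits C into contiguous 4-element chunks and moves the chunk axis to the end. A minimal NumPy sketch of that equivalence, assuming C divides evenly by the chunk size so no padding is introduced; the helper and variable names here are illustrative only and are not part of any patch above.

import numpy as np

def nchw_to_nchw4c(x: np.ndarray, chunk: int = 4) -> np.ndarray:
    # Split C into (C // chunk, chunk): NCHW viewed as the "NC4cHW" form from the
    # create_cached_read docstring, then move the inner chunk axis to the end (NCHWc).
    n, c, h, w = x.shape
    assert c % chunk == 0, "the operator guarantees no padding, so C must divide evenly"
    return x.reshape(n, c // chunk, chunk, h, w).transpose(0, 1, 3, 4, 2)

x = np.arange(1 * 8 * 2 * 2, dtype="float16").reshape(1, 8, 2, 2)
y = nchw_to_nchw4c(x)
assert y.shape == (1, 2, 2, 2, 4)
# y[n, c_outer, h, w, c_inner] == x[n, c_outer * 4 + c_inner, h, w], matching the
# p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] reads in the expected modules above.
assert y[0, 1, 0, 1, 3] == x[0, 1 * 4 + 3, 0, 1]
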
From a5d8f5f50161e27af83031c48e020f3aee0d8a08 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 21 Mar 2023 15:25:46 -0700 Subject: [PATCH 38/40] lint again :/ --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index ec93103f8209..06b6514ee3aa 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -264,7 +264,9 @@ def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: return tvm.nd.array(numpy_init, device) def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc(extracted_task, tile_size) + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( + extracted_task, tile_size + ) output_tensor = get_output_tensor() run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) return output_tensor.numpy() From bf60774dfffb42ba0bac1695c4a26f19e7574c8d Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 21 Mar 2023 15:29:07 -0700 Subject: [PATCH 39/40] remove uneeded newline --- src/meta_schedule/postproc/rewrite_cooperative_fetch.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 19682e6c614a..353b90c36423 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -39,7 +39,6 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins if (thread_axis != axis) { return NullOpt; } - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } From 2c1047a05f50421dc36888cfed3405bec03f4883 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 23 Mar 2023 09:54:59 -0700 Subject: [PATCH 40/40] remove leading spaces --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 06b6514ee3aa..d1ba84d836be 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -373,7 +373,7 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3]) T.block_attr({"dst_layout": "NHWC", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3] = T.if_then_else(v_ax0 < T.int64(1) and v_ax3 < T.int64(32) and v_ax1 < T.int64(32) and v_ax2 < T.int64(32), p0_shared[v_ax0, v_ax3, v_ax1, v_ax2], T.float16(0)) - + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_simple_implicit_reshape(self):
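
Taken together, the schedule rule in this series stages a tile_size x tile_size tile of the input through shared memory so that both the global-memory read and the global-memory write iterate contiguously in their innermost loop. Below is a self-contained NumPy sketch of that access pattern for a plain 32x32 transpose with 16x16 tiles; it is an emulation for illustration only, not the generated CUDA kernel, and the function name is made up for this sketch.

import numpy as np

def tiled_transpose(src: np.ndarray, tile_size: int = 16) -> np.ndarray:
    n, m = src.shape
    dst = np.empty((m, n), dtype=src.dtype)
    # Each (bi, bj) pair plays the role of one threadblock.
    for bi in range(0, n, tile_size):
        for bj in range(0, m, tile_size):
            # Read step: each row segment of src is copied with the fastest-varying
            # index walking the last axis, the analog of coalesced loads into shared.
            shared = src[bi:bi + tile_size, bj:bj + tile_size].copy()
            # Write step: rows of dst are written contiguously as well; the transpose
            # happens on the staging buffer, never directly on global memory.
            dst[bj:bj + tile_size, bi:bi + tile_size] = shared.T
    return dst

x = np.arange(32 * 32, dtype="float16").reshape(32, 32)
np.testing.assert_array_equal(tiled_transpose(x), x.T)

The actual rule additionally factors the loop nest so that split dimensions (the implicit-reshape case) and extents smaller than tile_size are handled, and it leaves the default injective schedule in the tuning space so AutoBind and the RewriteCooperativeFetch postprocessor touched above can still handle thread binding for that candidate.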