From 1df07f7b0241a08d6d463f592078378bb8c7cff3 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 1 Mar 2023 15:11:54 -0800 Subject: [PATCH 01/40] initial basis --- include/tvm/topi/transform.h | 10 +- python/test_2.py | 71 ++++++ python/tvm/relay/op/_transform.py | 2 +- python/tvm/relay/op/strategy/cuda.py | 261 +++++++++++++++++++++++ python/tvm/relay/op/strategy/generic.py | 36 +++- python/tvm/topi/cuda/batch_matmul.py | 11 +- python/tvm/topi/transform.py | 17 +- python/tvm/topi/x86/new.py | 12 ++ src/tir/schedule/primitive/compute_at.cc | 5 + src/topi/transform.cc | 2 +- 10 files changed, 406 insertions(+), 21 deletions(-) create mode 100644 python/test_2.py create mode 100644 python/tvm/topi/x86/new.py diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index dff6374a6185..51058485838f 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1596,8 +1596,10 @@ inline Array meshgrid(const Array& inputs, const std::string& in */ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, const std::string& dst_layout, + const std::string schedule_rule = "None", const std::string name = "T_layout_trans", const std::string tag = kInjective) { + // LOG(FATAL) << "WHATTT"; Layout src_layout_struct(src_layout); Layout dst_layout_struct(dst_layout); @@ -1614,6 +1616,12 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Array dst_shape = layout_converter.ForwardShape(src->shape); + Map attrs = { + {"schedule_rule", String(schedule_rule)}, + {"src_layout", String(src_layout)}, + {"dst_layout", String(dst_layout)} + }; + return compute( dst_shape, [&](const Array& dst_indices) { @@ -1625,7 +1633,7 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, } return if_then_else(in_range, src(src_indices), tvm::cast(src->dtype, PrimExpr(0))); }, - name, tag); + name, tag, attrs); } /*! \brief Utility function for auto_scheduler_layout_transform */ diff --git a/python/test_2.py b/python/test_2.py new file mode 100644 index 000000000000..e0ddc33d1586 --- /dev/null +++ b/python/test_2.py @@ -0,0 +1,71 @@ +import onnx + +import tvm.relay +from tvm import meta_schedule as ms + +mod, params = tvm.relay.frontend.from_onnx( + onnx.load("resnet50-v1-12-int8.onnx"), shape={"data": [1, 3, 224, 224]}, freeze_params=True +) + + +def apply_relay_passes( + mod: tvm.IRModule, +): + """Applies relay passes to the input IRModule. + + :param mod: The input IRModule + :return: The IRModule after all the relays passes have been applied + """ + # N.B. Defer the import so as not to unconditionally require other runtimes. + from tvm import relay, transform + from tvm.relay.op.contrib.dnnl import rewrite_layer_norm + + mod = rewrite_layer_norm(mod) + + passes = [] + + # If the inputs are static, run DynamicToStatic to remove + # any residual dynamism in the model. + # If the inputs are dynamic, this pass is much more expensive + # and will not remove dynamism from the model, so we skip it. + passes.append(relay.transform.DynamicToStatic()) + + # Infer types prior to the quantization pass below as some + # transforms might need them. + passes.append(relay.transform.InferType()) + + # Transform fake quantized sub-graphs to actual integer ops. + # Should have no effect on graphs without the relevant patterns. + passes.append(relay.transform.FakeQuantizationToInteger()) + + passes.append(relay.transform.FastMath()) + + # Fold constants after FQ2I becuase some weights are stored in FP32. 
+ passes.append(relay.transform.FoldConstant()) + + # Use sequential to solve for dependent passes + seq = transform.Sequential(passes) + + with tvm.transform.PassContext(opt_level=4): + mod = seq(mod) + + mod = relay.transform.InferType()(mod) + # mod["main"] = rewrite(relay.qnn.transform.LayerNormQuantizedRewrite(), mod["main"]) + + return mod + + +mod = apply_relay_passes(mod) + +print(mod) +target = tvm.target.Target("nvidia/geforce-rtx-3070") +work_dir = "resnet_tune" + +db = ms.relay_integration.tune_relay(mod, params, target, work_dir, 1000) + +""" +Works: +Does not: +57de9e7f3d2711582368903ce95f08b91216b7b5 Mon Nov 28 21:28:37 2022 -0800 + +""" diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 4499d5e2266e..93df67ff6b99 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -94,7 +94,7 @@ def compute_strided_set(attrs, inputs, output_type): _reg.register_injective_schedule("strided_set") # layout_transform -_reg.register_injective_schedule("layout_transform") +_reg.register_strategy("layout_transform", strategy.layout_transform_strategy) _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) _reg.register_injective_schedule("auto_scheduler_layout_transform") _reg.register_pattern("auto_scheduler_layout_transform", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 416637c14905..404b5eb424a1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1396,3 +1396,264 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): name="dft.cuda", ) return strategy + +@layout_transform_strategy.register(["cuda", "gpu"]) +def layout_transform_strategy(attrs, inputs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), + wrap_topi_schedule(topi.cuda.schedule_injective), + name="layout_transform.cuda", + ) + return strategy + + +# "meta_schedule.cuda.layout_transform" +import tvm + +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule(sch, block): + # params: input_buffer, output_buffer + params = sch.mod["main"].params + input_buffer = sch.mod["main"].buffer_map[params[0]] + output_buffer = sch.mod["main"].buffer_map[params[1]] + + input_shape = [int(dim) for dim in input_buffer.shape] + output_shape = [int(dim) for dim in output_buffer.shape] + + src_layout = sch.get_sref(block).stmt.annotations["src_layout"] + dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + + import math + from typing import List, Sequence, Tuple + + from tvm.tir.schedule import BlockRV, ExprRV, LoopRV + + def schedule_layout_transform_v4( + sch: tvm.tir.Schedule, + src_layout: str, + dst_layout: str, + input_shape: List[int], + tile_size: ExprRV, + ): + ## Tiling block_read + # Let N and M represent the dimensions of interest + # N and M are the last dim of src_layout and dst_layout respectively. + # Then the initial block's loop will look like + # [i1, i2 ... M ... j1, j2 ... N] + # + # To guarantee contiguous read for N, we must group reads + # so that loops which factor N, j_n, j_n-1 ... are the innermost dimension + # Therefore our strategy for guaranteeing contiguous writes for N is by + # continually splitting inner-most dimension in order of N, j_n, j_n-1... + # by factors which divide into tile_size until we have the final tile_size. + # e.g. 
if tile_size = 32. N = 2, j_n = 2, j_n-1 = 4, j_n-2 = 24 + # Then by combining N...j_n-1 we get a factor of 16, by spliiting up j_n-2 into + # two loops of 2 and 12, we can combine with the new loop of factor 2 to get a + # factor of 32. Note things don't divide evenly often time so we may have to pad + # to properly factorize. + # + # Similarly with M, to have contiguous writes we must consider the dst_layout: + # [a1, a2 ... N ... b1, b2 ... M] + # So that loops which factor M, b_m, b_m-1 ... are the innermost dimension + # Note that the dimension b_m, b_m-1 and j_n, j_n-1 may refer to the same dimension! + # By factoring j_n, j_n-1 we may be contributing to the innermost write dimension for + # M. However we note that both reads and writes must ideally have the same amount of + # work per thread for layout transforms so we still must build out to factors up + # to tile_size. + + def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): + """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding""" + l1, l2 = sch.split(loop, [None, requested_size]) + return sch.fuse(l1, l2) + + def pad_dimension_to_factor_of_tile_size( + loop: LoopRV, initial_size: int, tile_size: int = tile_size + ) -> Tuple[LoopRV, int]: + """ + Pads loop of given size until it is divisble into tile_size. + If the given size of the loop is greater than tile size. Do not pad. + + example, loop_size = 5, tile_size = 32. loop_size --> 8 + loop_size = 5, tile_size = 36. loop_size --> 6 + loop_size = 8, tile_size = 32. loop_size --> 8 + loop_size = 33, tile_size = 32. loop_size --> 33 + + Returns padded loopRV and the new size + """ + if tile_size % initial_size == 0: + return loop, int(initial_size) + + if initial_size > tile_size or initial_size == tile_size: + return loop, int(initial_size) + + # if initial_size > tile_size return without change, factor = 1 + size = initial_size + while (tile_size % size) % tile_size > 0: + size += 1 + + return pad_dimension_to_at_least_number(loop, size), int(size) + + def spin_out_factor( + loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int + ) -> Tuple[List[LoopRV], List[int], int]: + """ + Factor out loop extant to reach the requested factor. Updates the schedule in-place. + + E.g. say we want to factors which eventually multiply to 32 (factor_needed). + + Say we have the index we chose is a loop with an extant of 8. + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. + - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops + - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 + - we then split up the loop into extants 1 and 4, moving the 4 to the back + - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + - 8 is larger than 5 so we immediately do the splitting routine. + - the 8 extant loop becomes loops with extants 2 and 5 + - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 + + After updating loop ordering in place, returns the new list of loops, extants, and the + remaining factor needed. 
+ """ + cur_loop = loops[index] + cur_extant = loop_extants[index] + + # Pad loops to divide evenly for factors needed, and split + new_loop, new_size = pad_dimension_to_factor_of_tile_size( + cur_loop, cur_extant, tile_size=factor_needed + ) + + split_factor = min(new_size, factor_needed) + new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) + factor_needed = factor_needed // split_factor + + # update caching + loops[index] = new_loop_split + loops.append(factored_loop) + + loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants.append(split_factor) + + sch.reorder(*loops) + return loops, loop_extants, factor_needed + + def factor_dim_in_order( + indices: Sequence[int], + loops: List[LoopRV], + cur_loop_extants: List[int], + work_needed_inner_loop: int = tile_size, + ): + """TODO""" + for i in indices: + if work_needed_inner_loop == 1: + break + loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( + loops, cur_loop_extants, i, work_needed_inner_loop + ) + return loops, cur_loop_extants + + def get_high_level_loop_structure(block): + """Runs the factorization described above.""" + # index 0 ... rank - 1 will always correspond to original loops + # perhaps after they have been factored. + loops = sch.get_loops(block) + cur_loop_extants = list(input_shape) + + # Factor dim0 tile size and fuse things together + loops, cur_loop_extants = factor_dim_in_order( + range(rank - 1, -1, -1), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + # The factors which multiply to tile_size are now in back of our + # list of loops. However because we added them by traversing the inner + # dimensions, they are actually reversed order to guarantee the best access + # so reorder so reorder before fusing. + loops = loops[:rank] + loops[rank:][::-1] + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] + sch.reorder(*loops) + dim0_loop_tiled = sch.fuse(*loops[rank:]) + loops = loops[:rank] + loops.append(dim0_loop_tiled) + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants.append(tile_size) + + # Same thing with dim1 + # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep + loops, cur_loop_extants = factor_dim_in_order( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + loops = loops[: rank + 1] + loops[rank + 1 :][::-1] + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] + sch.reorder(*loops) + dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) + loops = loops[: rank + 1] + loops.append(dim1_loop_tiled) + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants.append(tile_size) + + rank = len(src_layout) + + # Assume write to output global memory is coalesced + block_write = sch.get_block(name="T_layout_trans", func_name="main") + + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + block_read = sch.cache_read(block_write, 0, "shared") + + # Here we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read) + + loops = sch.get_loops(block_read) + + # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small + # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) + loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) + + # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot + # and we just need to move dim0 in (last dimension of dst). + # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] + # where n >= 2. + dim0_loop_index = src_layout.index(dst_layout[-1]) + dim0_loop = loops.pop(dim0_loop_index) + loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] + sch.reorder(*loops) + + # After this: [outer_loop (block), dim0_tiled, dim1_tiled] + outer_loop = sch.fuse(*loops[:-2]) + + # Now that we have the high level loop structure, we can use reverse_compute_at magic + # To get the proper loop structure for writing! This is also as coalesced as possible + # already. + sch.reverse_compute_at(block_write, outer_loop) + + # Fuse all inner loops for the write into 2 loops, grab inner loops for both read + # and write block which have locality (we will bind these to threadIdx) + fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) + _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) + inner_read_loop = sch.get_loops(block_read)[-2] + + sch.bind(loop=outer_loop, thread_axis="blockIdx.x") + sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") + sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + + # tile_size = sch.sample_categorical([8, 16, 32, 64], [0.25] * 4) + schedule_layout_transform_v4(sch, src_layout, dst_layout, input_shape, 32) + return [sch] diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 733b630fc4da..2b29f5c28954 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -21,12 +21,8 @@ from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func -from tvm.topi.utils import ( - get_const_float, - get_const_int, - get_const_tuple, - get_float_tuple, -) +from tvm.topi.utils import (get_const_float, get_const_int, get_const_tuple, + get_float_tuple) from .. import op as _op @@ -2060,3 +2056,31 @@ def conv2d_backward_weight_strategy(attrs, inputs, out_type, target): "conv2d_backward_weight is currently only supported with cudnn. " "Please run Legalize pass to decompose this op into supported ops." 
) + + +@override_native_generic_func("layout_transform_strategy") +def layout_transform_strategy(attrs, inputs, out_type, target): + """layout transform generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_layout_transform(topi.layout_transform), + wrap_topi_schedule(topi.generic.schedule_injective), + name="layout_transform.generic", + ) + return strategy + + +def wrap_compute_layout_transform(topi_compute, schedule_rule=""): + """Wrap stft compute""" + + def _compute_layout_transform(attrs, inputs, output_type): + return [ + topi_compute( + inputs[0], + attrs.src_layout, + attrs.dst_layout, + schedule_rule, + ) + ] + + return _compute_layout_transform diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index d2f5c9b9c586..829d2d68940d 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -17,12 +17,12 @@ # pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument """cuda batch_matmul operators""" import tvm -from tvm import autotvm -from tvm import te +from tvm import autotvm, te +from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity from tvm.contrib import cublas -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import nn, generic -from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor + +from .. import generic, nn +from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline from .tensor_intrin import dp4a @@ -87,6 +87,7 @@ def schedule_batch_matmul(cfg, outs): s: Schedule The computation schedule for the op. """ + breakpoint() outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 23334da9c25c..e4fe3c583990 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -17,13 +17,13 @@ # pylint: disable=invalid-name,consider-using-enumerate,redefined-outer-name """Injective transformation operators""" from __future__ import absolute_import as _abs + import tvm -from tvm import te -from tvm import topi +from tvm import te, topi from tvm.te import hybrid -from . import cpp -from . import tag -from .utils import within_index, make_idx, const_vector + +from . import cpp, tag +from .utils import const_vector, make_idx, within_index def expand_dims(a, axis, num_newaxis=1): @@ -636,7 +636,7 @@ def tile(a, reps): return cpp.tile(a, reps) -def layout_transform(array, src_layout, dst_layout): +def layout_transform(array, src_layout, dst_layout, schedule_rule="None"): """Transform the layout according to src_layout and dst_layout Parameters @@ -649,8 +649,11 @@ def layout_transform(array, src_layout, dst_layout): dst_layout : str the destination layout. 
+ + schedule_rule : str + the schedule rule to apply if any """ - return cpp.layout_transform(array, src_layout, dst_layout) + return cpp.layout_transform(array, src_layout, dst_layout, schedule_rule) def shape(array, dtype="int32"): diff --git a/python/tvm/topi/x86/new.py b/python/tvm/topi/x86/new.py new file mode 100644 index 000000000000..e611b5a84a51 --- /dev/null +++ b/python/tvm/topi/x86/new.py @@ -0,0 +1,12 @@ +import tensorflow as tf + +indices = tf.constant([[1], [1]]) +updates = tf.constant( + [ + [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], + ] +) +shape = tf.constant([4, 4, 4]) +scatter = tf.scatter_nd(indices, updates, shape) +print(scatter) diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 988c73c3f071..e027cfdd5742 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -706,6 +706,11 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s /*provided_regions=*/std::move(provided_regions), /*required_regions=*/std::move(required_regions), /*analyzer=*/analyzer); + + for (int i = 0; i < iter_doms.size(); i++) { + //LOG(WARNING) << i << " " << iter_doms[i].dom << " " << iter_doms[i].bound << std::endl; + } + // Step 6. Create the new scope according to the iteration domain reconstructor.MakeNewLoop(/*insert_position=*/insert_position, /*iter_doms=*/std::move(iter_doms), /*analyzer=*/analyzer, /*preserve_unit_loops=*/preserve_unit_loops); diff --git a/src/topi/transform.cc b/src/topi/transform.cc index 0ea1392e5daf..bbefa19c2055 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -87,7 +87,7 @@ TVM_REGISTER_GLOBAL("topi.split").set_body([](TVMArgs args, TVMRetValue* rv) { }); TVM_REGISTER_GLOBAL("topi.layout_transform").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = layout_transform(args[0], args[1], args[2]); + *rv = layout_transform(args[0], args[1], args[2], args[3]); }); TVM_REGISTER_GLOBAL("topi.take").set_body([](TVMArgs args, TVMRetValue* rv) { From 8748bc819d30b61877d7305ac335b555046562ae Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 1 Mar 2023 15:40:46 -0800 Subject: [PATCH 02/40] Generated all the tile sizes --- python/tvm/relay/op/strategy/cuda.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 404b5eb424a1..b3d7909c614b 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1408,7 +1408,6 @@ def layout_transform_strategy(attrs, inputs, out_type, target): return strategy -# "meta_schedule.cuda.layout_transform" import tvm @tvm.register_func("meta_schedule.cuda.layout_transform") @@ -1551,11 +1550,11 @@ def factor_dim_in_order( ): """TODO""" for i in indices: - if work_needed_inner_loop == 1: - break loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( loops, cur_loop_extants, i, work_needed_inner_loop ) + if work_needed_inner_loop == 1: + break return loops, cur_loop_extants def get_high_level_loop_structure(block): @@ -1654,6 +1653,16 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") - # tile_size = sch.sample_categorical([8, 16, 32, 64], [0.25] * 4) - schedule_layout_transform_v4(sch, src_layout, dst_layout, input_shape, 32) - return [sch] + schedules 
= [] + + # Tile size 2,3,4...64 + # Tile size of 1 does not make sense... + for tile_size in range(2, 65): + cur_sch = sch.copy() + schedule_layout_transform_v4(cur_sch, src_layout, dst_layout, input_shape, tile_size) + schedules.append(cur_sch) + + # Also include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + + return schedules From 54d36255ffc28f676cee0537bc837839bc3f4484 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 12:34:08 -0800 Subject: [PATCH 03/40] is this all you need? --- python/tvm/relay/op/strategy/cuda.py | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index b3d7909c614b..6c600534cd28 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1430,6 +1430,7 @@ def cuda_layout_transform_schedule_rule(sch, block): def schedule_layout_transform_v4( sch: tvm.tir.Schedule, + block_write: BlockRV, src_layout: str, dst_layout: str, input_shape: List[int], @@ -1606,14 +1607,12 @@ def get_high_level_loop_structure(block): rank = len(src_layout) - # Assume write to output global memory is coalesced - block_write = sch.get_block(name="T_layout_trans", func_name="main") - # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be # for i, j, k in T.grid(4, 6, 8): # ... # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write block_read = sch.cache_read(block_write, 0, "shared") # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] @@ -1653,16 +1652,41 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + from collections import deque + def auto_inline(start_block): + # BFS from start block in a chain (no branches) + fringe = deque([start_block]) + visited = set() + while len(fringe) > 0: + cur_block = fringe.popleft() + if cur_block in visited: + continue + else: + visited.add(cur_block) + + consumer_blocks = sch.get_consumers(cur_block) + + if len(consumer_blocks) >= 1: + fringe.extend(consumer_blocks) + sch.compute_inline(cur_block) + else: + # consumer yay! + return cur_block + schedules = [] + + # For each schedule we also want to inline each stage as would be done in normal circumstances + # The block which producers the layout transform block seems + block = auto_inline(block) # Tile size 2,3,4...64 # Tile size of 1 does not make sense... 
for tile_size in range(2, 65): cur_sch = sch.copy() - schedule_layout_transform_v4(cur_sch, src_layout, dst_layout, input_shape, tile_size) + schedule_layout_transform_v4(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) # Also include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - + return schedules From 6ccea68079ea89411b79735eec26d1fe7a92c61b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 15:34:52 -0800 Subject: [PATCH 04/40] linting lint move schedule rule to own file lint p2 layout transform fixings --- include/tvm/topi/transform.h | 10 +- python/test_2.py | 71 ---- .../meta_schedule/schedule/cuda/__init__.py | 2 + .../schedule/cuda/layout_transform.py | 321 ++++++++++++++++++ python/tvm/relay/op/strategy/cuda.py | 285 +--------------- python/tvm/relay/op/strategy/generic.py | 5 +- python/tvm/topi/cuda/batch_matmul.py | 11 +- python/tvm/topi/x86/new.py | 12 - src/tir/schedule/primitive/compute_at.cc | 5 - 9 files changed, 335 insertions(+), 387 deletions(-) delete mode 100644 python/test_2.py create mode 100644 python/tvm/meta_schedule/schedule/cuda/layout_transform.py delete mode 100644 python/tvm/topi/x86/new.py diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 51058485838f..0d0c5d5962ce 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1599,7 +1599,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, const std::string schedule_rule = "None", const std::string name = "T_layout_trans", const std::string tag = kInjective) { - // LOG(FATAL) << "WHATTT"; Layout src_layout_struct(src_layout); Layout dst_layout_struct(dst_layout); @@ -1616,11 +1615,10 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Array dst_shape = layout_converter.ForwardShape(src->shape); - Map attrs = { - {"schedule_rule", String(schedule_rule)}, - {"src_layout", String(src_layout)}, - {"dst_layout", String(dst_layout)} - }; + Map attrs = {{"schedule_rule", String(schedule_rule)}, + // Information about layouts needed for the schedule rule + {"src_layout", String(src_layout)}, + {"dst_layout", String(dst_layout)}}; return compute( dst_shape, diff --git a/python/test_2.py b/python/test_2.py deleted file mode 100644 index e0ddc33d1586..000000000000 --- a/python/test_2.py +++ /dev/null @@ -1,71 +0,0 @@ -import onnx - -import tvm.relay -from tvm import meta_schedule as ms - -mod, params = tvm.relay.frontend.from_onnx( - onnx.load("resnet50-v1-12-int8.onnx"), shape={"data": [1, 3, 224, 224]}, freeze_params=True -) - - -def apply_relay_passes( - mod: tvm.IRModule, -): - """Applies relay passes to the input IRModule. - - :param mod: The input IRModule - :return: The IRModule after all the relays passes have been applied - """ - # N.B. Defer the import so as not to unconditionally require other runtimes. - from tvm import relay, transform - from tvm.relay.op.contrib.dnnl import rewrite_layer_norm - - mod = rewrite_layer_norm(mod) - - passes = [] - - # If the inputs are static, run DynamicToStatic to remove - # any residual dynamism in the model. - # If the inputs are dynamic, this pass is much more expensive - # and will not remove dynamism from the model, so we skip it. - passes.append(relay.transform.DynamicToStatic()) - - # Infer types prior to the quantization pass below as some - # transforms might need them. 
- passes.append(relay.transform.InferType()) - - # Transform fake quantized sub-graphs to actual integer ops. - # Should have no effect on graphs without the relevant patterns. - passes.append(relay.transform.FakeQuantizationToInteger()) - - passes.append(relay.transform.FastMath()) - - # Fold constants after FQ2I becuase some weights are stored in FP32. - passes.append(relay.transform.FoldConstant()) - - # Use sequential to solve for dependent passes - seq = transform.Sequential(passes) - - with tvm.transform.PassContext(opt_level=4): - mod = seq(mod) - - mod = relay.transform.InferType()(mod) - # mod["main"] = rewrite(relay.qnn.transform.LayerNormQuantizedRewrite(), mod["main"]) - - return mod - - -mod = apply_relay_passes(mod) - -print(mod) -target = tvm.target.Target("nvidia/geforce-rtx-3070") -work_dir = "resnet_tune" - -db = ms.relay_integration.tune_relay(mod, params, target, work_dir, 1000) - -""" -Works: -Does not: -57de9e7f3d2711582368903ce95f08b91216b7b5 Mon Nov 28 21:28:37 2022 -0800 - -""" diff --git a/python/tvm/meta_schedule/schedule/cuda/__init__.py b/python/tvm/meta_schedule/schedule/cuda/__init__.py index 937a6e16a91b..ce79d15cc4b4 100644 --- a/python/tvm/meta_schedule/schedule/cuda/__init__.py +++ b/python/tvm/meta_schedule/schedule/cuda/__init__.py @@ -15,3 +15,5 @@ # specific language governing permissions and limitations # under the License. """Per-block schedule rules in MetaSchedule for target key 'cuda'""" + +from . import layout_transform diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py new file mode 100644 index 000000000000..e064121e8514 --- /dev/null +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -0,0 +1,321 @@ +import tvm +from tvm import topi +import math +from typing import List, Sequence, Tuple + +from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +from collections import deque + + +def tile_layout_transform( + sch: tvm.tir.Schedule, + block_write: BlockRV, + src_layout: str, + dst_layout: str, + input_shape: List[int], + tile_size: ExprRV, +): + """ + High level tiling for layout transform block. + """ + + ## Tiling layout transforms: + # Assume we have an input shape of [A, B, C, D] and want to layout transform + # ABCD --> DBAC so the output shape would be [D, B, A, C]. + # + # Consider reading from the input buffer in a cache-friendly fashion on CPU. We would + # expect a loop structure like: + # lAr, lBr, lCr, lDr = T.grid(A, B, C, D) + # + # Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: + # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) + # + # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads + # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some + # small sub-tensor of our input writing and then reading from shared memory. We must now + # construct our submatrix so that reading and writing can both be done with some contiguous + # access in global memory. + # + # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. + # We note that if we deal with a submatrix of shape [32, 32] which corresponds + # to the dimension of our input tensor, then rows of the submatrix are contiguous + # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our + # output vector. 
Therefore, with this tile shape we have opportunity to read + # contiguously in our input tensor and write to shared memory, and write contiguously + # to our output tensor. + # + # The multiple dimensional case has a similar analogue. We want to allocate shared + # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension + # of our shared memory to correspond to contiguous reads from the input tensor and + # the outer dimension to correspond to contiguous writes into the output tensor. + # + # In terms of the loop structure reading from the input tensor, the inner most loops + # of our tile must correspond to the inner most dimensions of the input shape, + # while the outer dimensions correspond to the inner most dimensions of the output shape. + # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` + # chunk of our loop in the shape of interest. + # + # An example is probably best to show this idea: + # Let's say we want a layout transform of ABCD --> DCAB. With shape + # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] + # + # And tile size 32. + # + # Then we initially have a coalesced-read loop pattern of: + # T.grid(1024_a, 2_b, 32_c, 8_d) + # + # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: + # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) + # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) + # + # To obtain an outer tile of 32, we factor from B then A to follow contiguous write + # pattern: + # + # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) + # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) + # + # Which allows us to read a tile with our wanted properties. + # For writing we use the existing analysis infrastructure to generate the proper structure for writing. + + def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): + """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" + l1, l2 = sch.split(loop, [None, requested_size]) + return sch.fuse(l1, l2) + + def pad_dimension_to_factor_of_tile_size( + loop: LoopRV, initial_size: int, tile_size: int = tile_size + ) -> Tuple[LoopRV, int]: + """ + Pads loop of given size until it is divisble into tile_size. + If the given size of the loop is greater than tile size. Do not pad. + + example, loop_size = 5, tile_size = 32. loop_size --> 8 + loop_size = 5, tile_size = 36. loop_size --> 6 + loop_size = 8, tile_size = 32. loop_size --> 8 + loop_size = 33, tile_size = 32. loop_size --> 33 + + Returns padded loopRV and the new size + """ + if tile_size % initial_size == 0: + return loop, int(initial_size) + + if initial_size > tile_size or initial_size == tile_size: + return loop, int(initial_size) + + # if initial_size > tile_size return without change, factor = 1 + size = initial_size + while (tile_size % size) % tile_size > 0: + size += 1 + + return pad_dimension_to_at_least_number(loop, size), int(size) + + def spin_out_factor( + loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int + ) -> Tuple[List[LoopRV], List[int], int]: + """ + Factor out loop dimensions to reach the requested factor. Updates the schedule in-place. + + E.g. say we want to factors which eventually multiply to 32 (factor_needed). + + Say we have the index we chose is a loop with an extant of 8. + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. 
+ - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops + - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 + - we then split up the loop into extants 1 and 4, moving the 4 to the back + - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 + + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + - 8 is larger than 5 so we immediately do the splitting routine. + - the 8 extant loop becomes loops with extants 2 and 5 + - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 + + After updating loop ordering in place, returns the new list of loops, extants, and the + remaining factor needed. + """ + cur_loop = loops[index] + cur_extant = loop_extants[index] + + # Pad loops to divide evenly for factors needed, and split + new_loop, new_size = pad_dimension_to_factor_of_tile_size( + cur_loop, cur_extant, tile_size=factor_needed + ) + + split_factor = min(new_size, factor_needed) + new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) + factor_needed = factor_needed // split_factor + + # update caching + loops[index] = new_loop_split + loops.append(factored_loop) + + loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants.append(split_factor) + + sch.reorder(*loops) + return loops, loop_extants, factor_needed + + def factor_dim_in_order( + indices: Sequence[int], + loops: List[LoopRV], + cur_loop_extants: List[int], + work_needed_inner_loop: int = tile_size, + ): + """Factors out the loops in the order of indices until we reach needed work. + + Adds new loop factors to the back in reverse order of access. + """ + for i in indices: + loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( + loops, cur_loop_extants, i, work_needed_inner_loop + ) + if work_needed_inner_loop == 1: + break + return loops, cur_loop_extants + + def get_high_level_loop_structure(block): + """Runs the factorization described above.""" + # index 0 ... rank - 1 will always correspond to original loops + # perhaps after they have been factored. + loops = sch.get_loops(block) + cur_loop_extants = list(input_shape) + + # Factor dim0 tile size and fuse things together + loops, cur_loop_extants = factor_dim_in_order( + range(rank - 1, -1, -1), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + # The factors which multiply to tile_size are now in back of our + # list of loops. However because we added them by traversing the inner + # dimensions, they are actually reversed order to guarantee the best access + # so reorder so reorder before fusing. 
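+        # i.e. reverse the appended tile factors so that the factor taken from the
+        # innermost original dimension becomes the innermost loop once they are fused.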
+ loops = loops[:rank] + loops[rank:][::-1] + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] + sch.reorder(*loops) + dim0_loop_tiled = sch.fuse(*loops[rank:]) + loops = loops[:rank] + loops.append(dim0_loop_tiled) + cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants.append(tile_size) + + # Same thing with dim1 + # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep + loops, cur_loop_extants = factor_dim_in_order( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ), + loops, + cur_loop_extants, + work_needed_inner_loop=tile_size, + ) + loops = loops[: rank + 1] + loops[rank + 1 :][::-1] + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] + sch.reorder(*loops) + dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) + loops = loops[: rank + 1] + loops.append(dim1_loop_tiled) + cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants.append(tile_size) + + rank = len(src_layout) + + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write + block_read = sch.cache_read(block_write, 0, "shared") + + # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read) + loops = sch.get_loops(block_read) + + # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small + # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) + loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) + + # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot + # and we just need to move dim0 in (last dimension of dst). + # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] + # where n >= 2. + dim0_loop_index = src_layout.index(dst_layout[-1]) + dim0_loop = loops.pop(dim0_loop_index) + loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] + sch.reorder(*loops) + + # After this: [outer_loop (block binding), dim0_tiled, dim1_tiled] + outer_loop = sch.fuse(*loops[:-2]) + + # Now that we have the high level loop structure, we can use reverse_compute_at magic + # To get the proper loop structure for writing! This is also as coalesced as possible + # already. + sch.reverse_compute_at(block_write, outer_loop) + + # Fuse all inner loops for the write into 2 loops, grab inner loops for both read + # and write block which have locality (we will bind these to threadIdx) + fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) + _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) + inner_read_loop = sch.get_loops(block_read)[-2] + + sch.bind(loop=outer_loop, thread_axis="blockIdx.x") + sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") + sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + +def auto_inline(start_block): + # Autoinlines given block into consumers, and repeats process for consumer of block + # Done by default for injective schedules. 
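+    # BFS through the chain of consumers: inline every block that still has a consumer
+    # into that consumer, and return the first block found with no consumers (the output block).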
+ fringe = deque([start_block]) + visited = set() + while len(fringe) > 0: + cur_block = fringe.popleft() + if cur_block in visited: + continue + else: + visited.add(cur_block) + + consumer_blocks = sch.get_consumers(cur_block) + if len(consumer_blocks) >= 1: + fringe.extend(consumer_blocks) + sch.compute_inline(cur_block) + else: + # Found output block, no more inlining needed + return cur_block + + +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule(sch, block): + # params: input_buffer, output_buffer + params = sch.mod["main"].params + input_buffer = sch.mod["main"].buffer_map[params[0]] + output_buffer = sch.mod["main"].buffer_map[params[1]] + + # Info needed for tiling + input_shape = [int(dim) for dim in input_buffer.shape] + output_shape = [int(dim) for dim in output_buffer.shape] + src_layout = sch.get_sref(block).stmt.annotations["src_layout"] + dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + block = auto_inline(block) + + schedules = [] + + # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. + for tile_size in range(2, 65): + cur_sch = sch.copy() + tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) + schedules.append(cur_sch) + + # Also include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + + return schedules diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 6c600534cd28..6111ceed0f98 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1397,6 +1397,7 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): ) return strategy + @layout_transform_strategy.register(["cuda", "gpu"]) def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() @@ -1406,287 +1407,3 @@ def layout_transform_strategy(attrs, inputs, out_type, target): name="layout_transform.cuda", ) return strategy - - -import tvm - -@tvm.register_func("meta_schedule.cuda.layout_transform") -def cuda_layout_transform_schedule_rule(sch, block): - # params: input_buffer, output_buffer - params = sch.mod["main"].params - input_buffer = sch.mod["main"].buffer_map[params[0]] - output_buffer = sch.mod["main"].buffer_map[params[1]] - - input_shape = [int(dim) for dim in input_buffer.shape] - output_shape = [int(dim) for dim in output_buffer.shape] - - src_layout = sch.get_sref(block).stmt.annotations["src_layout"] - dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] - - import math - from typing import List, Sequence, Tuple - - from tvm.tir.schedule import BlockRV, ExprRV, LoopRV - - def schedule_layout_transform_v4( - sch: tvm.tir.Schedule, - block_write: BlockRV, - src_layout: str, - dst_layout: str, - input_shape: List[int], - tile_size: ExprRV, - ): - ## Tiling block_read - # Let N and M represent the dimensions of interest - # N and M are the last dim of src_layout and dst_layout respectively. - # Then the initial block's loop will look like - # [i1, i2 ... M ... j1, j2 ... N] - # - # To guarantee contiguous read for N, we must group reads - # so that loops which factor N, j_n, j_n-1 ... are the innermost dimension - # Therefore our strategy for guaranteeing contiguous writes for N is by - # continually splitting inner-most dimension in order of N, j_n, j_n-1... 
- # by factors which divide into tile_size until we have the final tile_size. - # e.g. if tile_size = 32. N = 2, j_n = 2, j_n-1 = 4, j_n-2 = 24 - # Then by combining N...j_n-1 we get a factor of 16, by spliiting up j_n-2 into - # two loops of 2 and 12, we can combine with the new loop of factor 2 to get a - # factor of 32. Note things don't divide evenly often time so we may have to pad - # to properly factorize. - # - # Similarly with M, to have contiguous writes we must consider the dst_layout: - # [a1, a2 ... N ... b1, b2 ... M] - # So that loops which factor M, b_m, b_m-1 ... are the innermost dimension - # Note that the dimension b_m, b_m-1 and j_n, j_n-1 may refer to the same dimension! - # By factoring j_n, j_n-1 we may be contributing to the innermost write dimension for - # M. However we note that both reads and writes must ideally have the same amount of - # work per thread for layout transforms so we still must build out to factors up - # to tile_size. - - def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): - """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding""" - l1, l2 = sch.split(loop, [None, requested_size]) - return sch.fuse(l1, l2) - - def pad_dimension_to_factor_of_tile_size( - loop: LoopRV, initial_size: int, tile_size: int = tile_size - ) -> Tuple[LoopRV, int]: - """ - Pads loop of given size until it is divisble into tile_size. - If the given size of the loop is greater than tile size. Do not pad. - - example, loop_size = 5, tile_size = 32. loop_size --> 8 - loop_size = 5, tile_size = 36. loop_size --> 6 - loop_size = 8, tile_size = 32. loop_size --> 8 - loop_size = 33, tile_size = 32. loop_size --> 33 - - Returns padded loopRV and the new size - """ - if tile_size % initial_size == 0: - return loop, int(initial_size) - - if initial_size > tile_size or initial_size == tile_size: - return loop, int(initial_size) - - # if initial_size > tile_size return without change, factor = 1 - size = initial_size - while (tile_size % size) % tile_size > 0: - size += 1 - - return pad_dimension_to_at_least_number(loop, size), int(size) - - def spin_out_factor( - loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int - ) -> Tuple[List[LoopRV], List[int], int]: - """ - Factor out loop extant to reach the requested factor. Updates the schedule in-place. - - E.g. say we want to factors which eventually multiply to 32 (factor_needed). - - Say we have the index we chose is a loop with an extant of 8. - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 - - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. - - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops - - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 - - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 - - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 - - we then split up the loop into extants 1 and 4, moving the 4 to the back - - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 - - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 - - 8 is larger than 5 so we immediately do the splitting routine. 
- - the 8 extant loop becomes loops with extants 2 and 5 - - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 - - After updating loop ordering in place, returns the new list of loops, extants, and the - remaining factor needed. - """ - cur_loop = loops[index] - cur_extant = loop_extants[index] - - # Pad loops to divide evenly for factors needed, and split - new_loop, new_size = pad_dimension_to_factor_of_tile_size( - cur_loop, cur_extant, tile_size=factor_needed - ) - - split_factor = min(new_size, factor_needed) - new_loop_split, factored_loop = sch.split(new_loop, [None, split_factor]) - factor_needed = factor_needed // split_factor - - # update caching - loops[index] = new_loop_split - loops.append(factored_loop) - - loop_extants[index] = math.ceil(new_size / split_factor) - loop_extants.append(split_factor) - - sch.reorder(*loops) - return loops, loop_extants, factor_needed - - def factor_dim_in_order( - indices: Sequence[int], - loops: List[LoopRV], - cur_loop_extants: List[int], - work_needed_inner_loop: int = tile_size, - ): - """TODO""" - for i in indices: - loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( - loops, cur_loop_extants, i, work_needed_inner_loop - ) - if work_needed_inner_loop == 1: - break - return loops, cur_loop_extants - - def get_high_level_loop_structure(block): - """Runs the factorization described above.""" - # index 0 ... rank - 1 will always correspond to original loops - # perhaps after they have been factored. - loops = sch.get_loops(block) - cur_loop_extants = list(input_shape) - - # Factor dim0 tile size and fuse things together - loops, cur_loop_extants = factor_dim_in_order( - range(rank - 1, -1, -1), - loops, - cur_loop_extants, - work_needed_inner_loop=tile_size, - ) - # The factors which multiply to tile_size are now in back of our - # list of loops. However because we added them by traversing the inner - # dimensions, they are actually reversed order to guarantee the best access - # so reorder so reorder before fusing. - loops = loops[:rank] + loops[rank:][::-1] - cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] - sch.reorder(*loops) - dim0_loop_tiled = sch.fuse(*loops[rank:]) - loops = loops[:rank] - loops.append(dim0_loop_tiled) - cur_loop_extants = cur_loop_extants[:rank] - cur_loop_extants.append(tile_size) - - # Same thing with dim1 - # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep - loops, cur_loop_extants = factor_dim_in_order( - ( - src_layout.index(dst_layout[loop_index_dst]) - for loop_index_dst in range(rank - 1, -1, -1) - ), - loops, - cur_loop_extants, - work_needed_inner_loop=tile_size, - ) - loops = loops[: rank + 1] + loops[rank + 1 :][::-1] - cur_loop_extants = cur_loop_extants[: rank + 1] + cur_loop_extants[rank + 1 :: -1] - sch.reorder(*loops) - dim1_loop_tiled = sch.fuse(*loops[rank + 1 :]) - loops = loops[: rank + 1] - loops.append(dim1_loop_tiled) - cur_loop_extants = cur_loop_extants[: rank + 1] - cur_loop_extants.append(tile_size) - - rank = len(src_layout) - - # Outer loop structure of read block matches that of src_layout - # E.g. if input_shape is [4, 6, 8]. Loops for read block will be - # for i, j, k in T.grid(4, 6, 8): - # ... - # Read block will read from global memory coalesced at the start - # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block_write, 0, "shared") - - # Here we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] - get_high_level_loop_structure(block_read) - - loops = sch.get_loops(block_read) - - # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small - # In all likelihood you should use a smaller tile, but I don't want things to crash. - loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) - loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) - - # We want the dim0 and dim1 parent loops to be the inner most. Right now dim1 is inner-msot - # and we just need to move dim0 in (last dimension of dst). - # Recall right now structure is at least [l1 l2 ... ln, dim0_tiled, dim1_tiled] - # where n >= 2. - dim0_loop_index = src_layout.index(dst_layout[-1]) - dim0_loop = loops.pop(dim0_loop_index) - loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] - sch.reorder(*loops) - - # After this: [outer_loop (block), dim0_tiled, dim1_tiled] - outer_loop = sch.fuse(*loops[:-2]) - - # Now that we have the high level loop structure, we can use reverse_compute_at magic - # To get the proper loop structure for writing! This is also as coalesced as possible - # already. - sch.reverse_compute_at(block_write, outer_loop) - - # Fuse all inner loops for the write into 2 loops, grab inner loops for both read - # and write block which have locality (we will bind these to threadIdx) - fused_write_loop = sch.fuse(*sch.get_loops(block_write)[1:]) - _, inner_write_loop = sch.split(fused_write_loop, [None, tile_size]) - inner_read_loop = sch.get_loops(block_read)[-2] - - sch.bind(loop=outer_loop, thread_axis="blockIdx.x") - sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") - sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") - - from collections import deque - def auto_inline(start_block): - # BFS from start block in a chain (no branches) - fringe = deque([start_block]) - visited = set() - while len(fringe) > 0: - cur_block = fringe.popleft() - if cur_block in visited: - continue - else: - visited.add(cur_block) - - consumer_blocks = sch.get_consumers(cur_block) - - if len(consumer_blocks) >= 1: - fringe.extend(consumer_blocks) - sch.compute_inline(cur_block) - else: - # consumer yay! - return cur_block - - schedules = [] - - # For each schedule we also want to inline each stage as would be done in normal circumstances - # The block which producers the layout transform block seems - block = auto_inline(block) - - # Tile size 2,3,4...64 - # Tile size of 1 does not make sense... - for tile_size in range(2, 65): - cur_sch = sch.copy() - schedule_layout_transform_v4(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) - schedules.append(cur_sch) - - # Also include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) - - return schedules diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 2b29f5c28954..d2a189093292 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -21,8 +21,7 @@ from tvm import _ffi, ir, te, topi from tvm.target import generic_func, override_native_generic_func -from tvm.topi.utils import (get_const_float, get_const_int, get_const_tuple, - get_float_tuple) +from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple from .. 
import op as _op @@ -2071,7 +2070,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): def wrap_compute_layout_transform(topi_compute, schedule_rule=""): - """Wrap stft compute""" + """Wrap layout transform compute""" def _compute_layout_transform(attrs, inputs, output_type): return [ diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 829d2d68940d..d2f5c9b9c586 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -17,12 +17,12 @@ # pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument """cuda batch_matmul operators""" import tvm -from tvm import autotvm, te -from tvm.autotvm.task.space import OtherOptionEntity, SplitEntity +from tvm import autotvm +from tvm import te from tvm.contrib import cublas - -from .. import generic, nn -from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity +from .. import nn, generic +from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor from .tensor_intrin import dp4a @@ -87,7 +87,6 @@ def schedule_batch_matmul(cfg, outs): s: Schedule The computation schedule for the op. """ - breakpoint() outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) diff --git a/python/tvm/topi/x86/new.py b/python/tvm/topi/x86/new.py deleted file mode 100644 index e611b5a84a51..000000000000 --- a/python/tvm/topi/x86/new.py +++ /dev/null @@ -1,12 +0,0 @@ -import tensorflow as tf - -indices = tf.constant([[1], [1]]) -updates = tf.constant( - [ - [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - ] -) -shape = tf.constant([4, 4, 4]) -scatter = tf.scatter_nd(indices, updates, shape) -print(scatter) diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index e027cfdd5742..988c73c3f071 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -706,11 +706,6 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s /*provided_regions=*/std::move(provided_regions), /*required_regions=*/std::move(required_regions), /*analyzer=*/analyzer); - - for (int i = 0; i < iter_doms.size(); i++) { - //LOG(WARNING) << i << " " << iter_doms[i].dom << " " << iter_doms[i].bound << std::endl; - } - // Step 6. 
Create the new scope according to the iteration domain reconstructor.MakeNewLoop(/*insert_position=*/insert_position, /*iter_doms=*/std::move(iter_doms), /*analyzer=*/analyzer, /*preserve_unit_loops=*/preserve_unit_loops); From ced49f4d1019c415e9ad779dcb531eec69330249 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 2 Mar 2023 17:25:56 -0800 Subject: [PATCH 05/40] forgot to forward arg --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index e064121e8514..47413b161619 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -269,7 +269,7 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") -def auto_inline(start_block): +def auto_inline(sch, start_block): # Autoinlines given block into consumers, and repeats process for consumer of block # Done by default for injective schedules. fringe = deque([start_block]) @@ -305,7 +305,7 @@ def cuda_layout_transform_schedule_rule(sch, block): # For each schedule we also want to inline each stage as would be done in normal circumstances # to prevent extraneous memory access. - block = auto_inline(block) + block = auto_inline(sch, block) schedules = [] From c6e87390a117f0e1c21ae9ce3a5bd662f2aa2271 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:09:24 -0800 Subject: [PATCH 06/40] fix tests --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 6 +++--- python/tvm/relay/op/strategy/generic.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 47413b161619..d2e11994f5eb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -309,13 +309,13 @@ def cuda_layout_transform_schedule_rule(sch, block): schedules = [] + # Always include the default schedules which will be handled via AutoBind schedule rule + schedules.append(sch) + # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. 
for tile_size in range(2, 65): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) - # Also include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) - return schedules diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d2a189093292..d53cd045383b 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -2063,7 +2063,8 @@ def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform), - wrap_topi_schedule(topi.generic.schedule_injective), + # Defined earlier in the file + schedule_injective, name="layout_transform.generic", ) return strategy From f18f933180d63f76221ed4d6e5e634362107b830 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:15:59 -0800 Subject: [PATCH 07/40] reduce search space --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index d2e11994f5eb..ac881059f6e2 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -312,8 +312,8 @@ def cuda_layout_transform_schedule_rule(sch, block): # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - # Tile size 2,3,4...64 as tile size of 1 has no coaslescing. - for tile_size in range(2, 65): + # Tile size 2,3,4...32 as tile size of 1 has no coaslescing. + for tile_size in range(2, 33): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) schedules.append(cur_sch) From 5a51ffa078402cc3f938b4c34a546df127d9e1e1 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Fri, 3 Mar 2023 13:22:50 -0800 Subject: [PATCH 08/40] lint --- .../schedule/cuda/layout_transform.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index ac881059f6e2..6d9545474a12 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -18,7 +18,7 @@ def tile_layout_transform( """ High level tiling for layout transform block. """ - + ## Tiling layout transforms: # Assume we have an input shape of [A, B, C, D] and want to layout transform # ABCD --> DBAC so the output shape would be [D, B, A, C]. @@ -31,15 +31,15 @@ def tile_layout_transform( # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) # # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads - # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some - # small sub-tensor of our input writing and then reading from shared memory. We must now - # construct our submatrix so that reading and writing can both be done with some contiguous + # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some + # small sub-tensor of our input writing and then reading from shared memory. 
We must now + # construct our submatrix so that reading and writing can both be done with some contiguous # access in global memory. # # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. - # We note that if we deal with a submatrix of shape [32, 32] which corresponds + # We note that if we deal with a submatrix of shape [32, 32] which corresponds # to the dimension of our input tensor, then rows of the submatrix are contiguous - # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our + # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our # output vector. Therefore, with this tile shape we have opportunity to read # contiguously in our input tensor and write to shared memory, and write contiguously # to our output tensor. @@ -48,13 +48,13 @@ def tile_layout_transform( # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension # of our shared memory to correspond to contiguous reads from the input tensor and # the outer dimension to correspond to contiguous writes into the output tensor. - # + # # In terms of the loop structure reading from the input tensor, the inner most loops - # of our tile must correspond to the inner most dimensions of the input shape, + # of our tile must correspond to the inner most dimensions of the input shape, # while the outer dimensions correspond to the inner most dimensions of the output shape. # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` - # chunk of our loop in the shape of interest. - # + # chunk of our loop in the shape of interest. + # # An example is probably best to show this idea: # Let's say we want a layout transform of ABCD --> DCAB. With shape # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] @@ -63,12 +63,12 @@ def tile_layout_transform( # # Then we initially have a coalesced-read loop pattern of: # T.grid(1024_a, 2_b, 32_c, 8_d) - # + # # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) # - # To obtain an outer tile of 32, we factor from B then A to follow contiguous write + # To obtain an outer tile of 32, we factor from B then A to follow contiguous write # pattern: # # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) @@ -165,7 +165,7 @@ def factor_dim_in_order( work_needed_inner_loop: int = tile_size, ): """Factors out the loops in the order of indices until we reach needed work. - + Adds new loop factors to the back in reverse order of access. """ for i in indices: @@ -269,6 +269,7 @@ def get_high_level_loop_structure(block): sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + def auto_inline(sch, start_block): # Autoinlines given block into consumers, and repeats process for consumer of block # Done by default for injective schedules. 
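
For orientation, the binding pattern built above (one fused outer loop on blockIdx.x, a tile-wide loop of both the read and write stages on threadIdx.x) can be reproduced by hand on a plain 2-D transpose. The standalone sketch below is illustrative only: it is not part of the patch, it simplifies the real rule, and it assumes a TVM build from around this series with TensorIR and tvm.script available.

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def transpose(a: T.handle, b: T.handle) -> None:
        A = T.match_buffer(a, (1024, 2048), "float32")
        B = T.match_buffer(b, (2048, 1024), "float32")
        for i, j in T.grid(2048, 1024):
            with T.block("T_layout_trans"):
                vi, vj = T.axis.remap("SS", [i, j])
                B[vi, vj] = A[vj, vi]

    # Illustrative hand-written schedule, not the patch's schedule rule.
    sch = tvm.tir.Schedule(transpose)
    blk = sch.get_block("T_layout_trans")
    shared = sch.cache_read(blk, 0, "shared")      # stage the input through shared memory
    i, j = sch.get_loops(blk)
    io, ii = sch.split(i, factors=[None, 32])
    jo, ji = sch.split(j, factors=[None, 32])
    sch.reorder(io, jo, ii, ji)
    outer = sch.fuse(io, jo)                       # one CUDA block per 32x32 tile
    sch.compute_at(shared, outer)                  # copy just that tile of A per block
    sch.bind(outer, "blockIdx.x")
    sch.bind(sch.get_loops(blk)[-2], "threadIdx.x")     # 32 threads cooperate on the write
    sch.bind(sch.get_loops(shared)[-2], "threadIdx.x")  # the same 32 threads fill the tile
    print(sch.mod)

Each 32x32 tile gets one CUDA block; the shared-memory stage lets the global read follow A's layout while the write follows B's, which is the same trade the rule makes for arbitrary rank.
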
@@ -296,7 +297,7 @@ def cuda_layout_transform_schedule_rule(sch, block): params = sch.mod["main"].params input_buffer = sch.mod["main"].buffer_map[params[0]] output_buffer = sch.mod["main"].buffer_map[params[1]] - + # Info needed for tiling input_shape = [int(dim) for dim in input_buffer.shape] output_shape = [int(dim) for dim in output_buffer.shape] From e1ce901018cb2303e8406ce0da6e7f490da0c037 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 13:42:32 -0800 Subject: [PATCH 09/40] schedule rule documentation --- include/tvm/topi/transform.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 0d0c5d5962ce..ff57f99075ff 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1592,6 +1592,7 @@ inline Array meshgrid(const Array& inputs, const std::string& in * \param dst_layout the destination layout. * \param name output tensor name. * \param tag output tensor tag. + * \param schedule_rule name of specialized schedule rule to use. * \return A tensor with shape in \p dst_layout */ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, From 5f0b1b00f98b9359a8ba06e566d869f931ba0284 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 15:02:46 -0800 Subject: [PATCH 10/40] add a note --- .../postproc/rewrite_cooperative_fetch.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 353b90c36423..f0e7e4eccfa1 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -39,7 +39,17 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins if (thread_axis != axis) { return NullOpt; } - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); + + try { + return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); + } catch (const std::exception& e) { + // This can occur if in a schedule we manually bind threads in the middle of a schedule + // and then later modify the schedule. As the passed in schedule is after running the entire trace + // the bound loop may be moved around in the IRModule. + // TODO: apply trace one inst at a time in the schedule so schedule state is always accurate to instruction + LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + return NullOpt; + } } /*! From 826a8772a7f1933c5c260989a5624a25e4e5f17a Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 8 Mar 2023 15:08:43 -0800 Subject: [PATCH 11/40] fix wording --- src/meta_schedule/postproc/rewrite_cooperative_fetch.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index f0e7e4eccfa1..7e314a50a97b 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -46,7 +46,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins // This can occur if in a schedule we manually bind threads in the middle of a schedule // and then later modify the schedule. As the passed in schedule is after running the entire trace // the bound loop may be moved around in the IRModule. 
- // TODO: apply trace one inst at a time in the schedule so schedule state is always accurate to instruction + // TODO: apply trace one inst at a time so schedule state is always accurate to instruction LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); return NullOpt; } From d11e66e7599a133690333901b939ac7a20c8413b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 13:54:48 -0700 Subject: [PATCH 12/40] handle implicit reshape case v1 --- .../schedule/cuda/layout_transform.py | 117 ++++++++++++++++-- 1 file changed, 106 insertions(+), 11 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 6d9545474a12..84bb696bb088 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,10 +1,10 @@ -import tvm -from tvm import topi import math -from typing import List, Sequence, Tuple +from collections import deque +from typing import List, Sequence, Tuple, Union +import tvm +from tvm import topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV -from collections import deque def tile_layout_transform( @@ -176,11 +176,88 @@ def factor_dim_in_order( break return loops, cur_loop_extants - def get_high_level_loop_structure(block): + # Layout transform allows semantics like NCHW --> NCHW4c + # Which involves splitting the original C axis into contiguous 4-element chunks + # This axis is then moved to the end (NCHWc) + # To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) + # And then the layout is just NCcHW --> NCHW4c + # input_shape, src_layout, dst_layout = handle_split_case(sch) + # Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst + def handle_block_implicit_reshape( + block_read, orig_input_shape, orig_src_layout, orig_dst_layout + ) -> Tuple[List[int], str, str]: + # Each loop should match src_layout dimension extants + loops = sch.get_loops(block_read) + + # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + split_dimensions: List[Tuple[int, int]] = [] + + # This is without numeric digits, e.g. 
NCHW4c -> NCHWc + new_dst_layout = [] + + # Use state machine to parse NCHW4c string + split_size = 0 + for char in orig_dst_layout: + if char.isnumeric(): + split_size = split_size * 10 + int(char) + else: + if char.islower(): + # hit axis like 'c', need to find parent axis 'C' in src_layout + src_layout_index = orig_src_layout.index(char.upper()) + split_dimensions.append((src_layout_index, split_size)) + split_size = 0 + new_dst_layout.append(char) + + # Calculate final input shapes, each of these are a single element for unsplit dims + # and tuples for split dims associated with the two new axis + input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] + new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + for src_layout_split_index, split_factor in split_dimensions: + dimension_name = new_src_layout[src_layout_split_index] + new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) + + remain_factor = input_shape[src_layout_split_index] // split_factor + input_shape[src_layout_split_index] = (remain_factor, split_factor) + + sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) + + # Finally to help analyzer, make layout match that of output + def index_map(*loop_indices): + answer = [] + assert len(loop_indices) == len(input_shape) + for dim, input_shape_solved in zip(loop_indices, input_shape): + if isinstance(input_shape_solved, tuple): + (_, split_factor) = input_shape_solved + answer.extend((dim // split_factor, dim % split_factor)) + else: + answer.append(dim) + return tuple(answer) + + sch.transform_layout( + block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True + ) + + # Unpack any tuples introduced via appending + def unpack_list(target_list) -> list: + output = [] + for ele in target_list: + if isinstance(ele, tuple): + output.extend(ele) + else: + output.append(ele) + return output + + new_src_layout = unpack_list(new_src_layout) + new_src_layout = "".join(new_src_layout) + new_dst_layout = "".join(new_dst_layout) + return unpack_list(input_shape), new_src_layout, new_dst_layout + + def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops # perhaps after they have been factored. - loops = sch.get_loops(block) + rank = len(input_shape) + loops = sch.get_loops(block_read) cur_loop_extants = list(input_shape) # Factor dim0 tile size and fuse things together @@ -223,8 +300,6 @@ def get_high_level_loop_structure(block): cur_loop_extants = cur_loop_extants[: rank + 1] cur_loop_extants.append(tile_size) - rank = len(src_layout) - # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be # for i, j, k in T.grid(4, 6, 8): @@ -233,8 +308,13 @@ def get_high_level_loop_structure(block): # Assume write to output global memory is coalesced in block_write block_read = sch.cache_read(block_write, 0, "shared") - # Here we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] - get_high_level_loop_structure(block_read) + # Grab final input shape and src and dst layouts. + input_shape, src_layout, dst_layout = handle_block_implicit_reshape( + block_read, input_shape, src_layout, dst_layout + ) + + # After this we have [loop1, loop2, loop3 ... 
dim0_tiled, dim1_tiled] + get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout) loops = sch.get_loops(block_read) # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small @@ -291,6 +371,18 @@ def auto_inline(sch, start_block): return cur_block +def handle_split_case(sch, input_shape, src_layout, dst_layout): + """ + Handle the case where the layout transform also reshapes some axis + e.g. NCHW --> NCHW4c + (split the original C axis into chunks of 4 contiguous elements, move this axis to end) + + Rewrites the schedule to handle this case and returns the input shape as well as + src/dst layouts to use for the later code. + """ + pass + + @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule(sch, block): # params: input_buffer, output_buffer @@ -313,7 +405,10 @@ def cuda_layout_transform_schedule_rule(sch, block): # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - # Tile size 2,3,4...32 as tile size of 1 has no coaslescing. + # Setup up basic structure of schedule before applying tiling + sch = sch.copy() + + # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. for tile_size in range(2, 33): cur_sch = sch.copy() tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) From 0be1be0c2cafc02f83ccdc701e037ca62975b74f Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 14:52:28 -0700 Subject: [PATCH 13/40] clean up comments --- .../schedule/cuda/layout_transform.py | 480 +++++++++++------- 1 file changed, 291 insertions(+), 189 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 84bb696bb088..c3166abea85c 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -6,9 +6,68 @@ from tvm import topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +## Tiling layout transforms: +# Assume we have an input shape of [A, B, C, D] and want to layout transform +# ABCD --> DBAC so the output shape would be [D, B, A, C]. +# +# Consider reading from the input buffer in a cache-friendly fashion on CPU. We would +# expect a loop structure like: +# lAr, lBr, lCr, lDr = T.grid(A, B, C, D) +# +# Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: +# lDw, lBw, lAw, lCw = T.grid(D, B, A, C) +# +# Clearly in many scenarios it is impossible to guarantee contiguous writes and reads +# within a single loop due to non-adjacent dimensions. Instead we work on transposing some +# small sub-tensor of our input writing and then reading from shared memory. We must now +# construct our submatrix so that reading and writing can both be done with some contiguous +# access in global memory. +# +# Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. +# We note that if we deal with a submatrix of shape [32, 32] which corresponds +# to the dimension of our input tensor, then rows of the submatrix are contiguous +# in the input tensor. Meanwhile, columns of our submatrix are contiguous in our +# output vector. Therefore, with this tile shape we have opportunity to read +# contiguously in our input tensor and write to shared memory, and write contiguously +# to our output tensor. +# +# The multiple dimensional case has a similar analogue. 
We want to allocate shared +# memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension +# of our shared memory to correspond to contiguous reads from the input tensor and +# the outer dimension to correspond to contiguous writes into the output tensor. +# +# In terms of the loop structure reading from the input tensor, the inner most loops +# of our tile must correspond to the inner most dimensions of the input shape, +# while the outer dimensions correspond to the inner most dimensions of the output shape. +# To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` +# chunk of our loop in the shape of interest. +# +# An example is probably best to show this idea: +# Let's say we want a layout transform of ABCD --> DCAB. With shape +# [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] +# +# And tile size 32. +# +# Then we initially have a coalesced-read loop pattern of: +# T.grid(1024_a, 2_b, 32_c, 8_d) +# +# To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: +# T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) +# T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) +# +# To obtain an outer tile of 32, we factor from B then A to follow contiguous write +# pattern: +# +# T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) +# T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) +# +# Which allows us to read a tile with our wanted properties. +# For writing we use the existing analysis infrastructure to generate the proper structure for writing. + def tile_layout_transform( sch: tvm.tir.Schedule, + block_read: BlockRV, block_write: BlockRV, src_layout: str, dst_layout: str, @@ -16,66 +75,38 @@ def tile_layout_transform( tile_size: ExprRV, ): """ - High level tiling for layout transform block. - """ + High level tiling for layout transform block. Mutates sch in place. + + Parameters + ---------- + sch: + The initial schedule. We expect `block_read` and `block_write` to correspond to + the blocks which reads and writes from global memory respectively. We also expect + block_read's initial loops to follow + + block_read: + The block which reads from global memory and writes to shared memory buffer. + + block_write: + The block which writes to global memory and reads from shared memory buffer. + + src_layout : + The src_layout, each character should appear once and also appear in dst_layout. + There should be not numeric characters and refer to potentially implicit reshapes. + E.g. the transform NCHW --> NCHW4c really implies NCcHW --> NCHWc. In this case + src_layout should be NCcHW. - ## Tiling layout transforms: - # Assume we have an input shape of [A, B, C, D] and want to layout transform - # ABCD --> DBAC so the output shape would be [D, B, A, C]. - # - # Consider reading from the input buffer in a cache-friendly fashion on CPU. We would - # expect a loop structure like: - # lAr, lBr, lCr, lDr = T.grid(A, B, C, D) - # - # Meanwhile consider writing to the output buffer in a cache-friendly fashion on CPU: - # lDw, lBw, lAw, lCw = T.grid(D, B, A, C) - # - # Clearly in many scenarios it is impossible to guarantee contiguous writes and reads - # within a single loop. Due to non-adjacent dimensions. Instead we work on transposing some - # small sub-tensor of our input writing and then reading from shared memory. We must now - # construct our submatrix so that reading and writing can both be done with some contiguous - # access in global memory. - # - # Consider the case of a 2D transpose. For example [1024, 2048] -> [2048, 1024]. 
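
The 2-D transpose observation above (rows of a 32x32 input tile are contiguous in the input buffer, while its columns land contiguously in the output) is easy to check numerically. The NumPy snippet below is our own illustration and is separate from the patch:

    import numpy as np

    A = np.arange(1024 * 2048, dtype=np.float32).reshape(1024, 2048)
    B = np.ascontiguousarray(A.T)      # the transform: [1024, 2048] -> [2048, 1024]
    tile = A[0:32, 0:32]               # one 32x32 sub-matrix of the input

    # Rows of the tile are contiguous in the input buffer A ...
    assert np.shares_memory(tile[0], A[0, 0:32])
    # ... and columns of the tile become contiguous rows of the output buffer B.
    assert (B[0, 0:32] == tile[:, 0]).all()
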
- # We note that if we deal with a submatrix of shape [32, 32] which corresponds - # to the dimension of our input tensor, then rows of the submatrix are contiguous - # in the input tensor. Meanwhile, columns of our submatrix are contiguous in our - # output vector. Therefore, with this tile shape we have opportunity to read - # contiguously in our input tensor and write to shared memory, and write contiguously - # to our output tensor. - # - # The multiple dimensional case has a similar analogue. We want to allocate shared - # memory per block of [`tile_size`, `tile_size`]. We want the inner most dimension - # of our shared memory to correspond to contiguous reads from the input tensor and - # the outer dimension to correspond to contiguous writes into the output tensor. - # - # In terms of the loop structure reading from the input tensor, the inner most loops - # of our tile must correspond to the inner most dimensions of the input shape, - # while the outer dimensions correspond to the inner most dimensions of the output shape. - # To obtain an inner tile with this loop structure we factor out a contiguous `tile_size` - # chunk of our loop in the shape of interest. - # - # An example is probably best to show this idea: - # Let's say we want a layout transform of ABCD --> DCAB. With shape - # [1024_a, 2_b, 32_c, 8_d] --> [8_d, 32_c, 1024_a, 2_b] - # - # And tile size 32. - # - # Then we initially have a coalesced-read loop pattern of: - # T.grid(1024_a, 2_b, 32_c, 8_d) - # - # To obtain an inner tile of 32, we factor 4 from 32_c and 8 from 8_d: - # T.grid(1024_a, 2_b, 8_c1, 1_d1, 4_c2t, 8_d2t) - # T.grid(1024_a, 2_b, 8_cr, 1_dr, 32_dim1) - # - # To obtain an outer tile of 32, we factor from B then A to follow contiguous write - # pattern: - # - # T.grid(64_a1, 1_b1, 8_cr, 1_dr, 16_a2t, 2_b2t, 32_dim1) - # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) - # - # Which allows us to read a tile with our wanted properties. - # For writing we use the existing analysis infrastructure to generate the proper structure for writing. + dst_layout: + The dst_layout. There should not be numeric characters, e.g. NCHW4c becomes NCHWc. + + input_shape: + The input shape after applying potentially implicit reshapes. Should match the loop + extants corresponding to src_layout. + + tile_size: + The tile size of read and writes. There will be tile_size threads per block, each of which + reads up to tile_size elements. + """ def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" @@ -86,15 +117,16 @@ def pad_dimension_to_factor_of_tile_size( loop: LoopRV, initial_size: int, tile_size: int = tile_size ) -> Tuple[LoopRV, int]: """ - Pads loop of given size until it is divisble into tile_size. + Pads loop of given size until it is divisible into tile_size. If the given size of the loop is greater than tile size. Do not pad. - example, loop_size = 5, tile_size = 32. loop_size --> 8 - loop_size = 5, tile_size = 36. loop_size --> 6 - loop_size = 8, tile_size = 32. loop_size --> 8 - loop_size = 33, tile_size = 32. loop_size --> 33 + examples: + - loop_size = 5 , tile_size = 32. loop_size --> 8 + - loop_size = 5 , tile_size = 36. loop_size --> 6 + - loop_size = 8 , tile_size = 32. loop_size --> 8 : since 8 already divides 32. + - loop_size = 33, tile_size = 32. loop_size --> 33 : since 33 > 32. - Returns padded loopRV and the new size + Returns padded loopRV and the new size. 
""" if tile_size % initial_size == 0: return loop, int(initial_size) @@ -113,22 +145,25 @@ def spin_out_factor( loops: List[LoopRV], loop_extants: List[int], index: int, factor_needed: int ) -> Tuple[List[LoopRV], List[int], int]: """ - Factor out loop dimensions to reach the requested factor. Updates the schedule in-place. + Factor out the requested loop's dimensions to reach the requested factor and + places the requested factor as the innermost loop. + + Updates the schedule in-place. E.g. say we want to factors which eventually multiply to 32 (factor_needed). Say we have the index we chose is a loop with an extant of 8. - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index = 3 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed = 32, index=3 (dim=8) - 8 divides into 32 so we just split up the loop into two loops with extants 1 and 8. - we then keep the 1-loop in place and move the new 8-loop to back of the list of loops - ending loops / loop_extants = [3, 32, 6, 1, 8], remaining_factor_needed = 32 / 8 = 4 - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index = 0 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=32, index=0 (dim=3) - 3 does not divide 32, so we pad until the extant divides 32, e.g. 4 - we then split up the loop into extants 1 and 4, moving the 4 to the back - ending loops / loop_extants = [1, 32, 6, 8, 4], remaining_factor_needed = 32 / 4 = 8 - E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index = 3 + E.g. loops / loop_extants = [3, 32, 6, 8], factor_needed=5, index=3 (dim=8) - 8 is larger than 5 so we immediately do the splitting routine. - the 8 extant loop becomes loops with extants 2 and 5 - ending loops / loop_extants = [1, 32, 6, 2, 5], remaining_factor_needed = 5 / 5 = 1 @@ -163,10 +198,11 @@ def factor_dim_in_order( loops: List[LoopRV], cur_loop_extants: List[int], work_needed_inner_loop: int = tile_size, - ): + ) -> Tuple[List[LoopRV], Sequence[int]]: """Factors out the loops in the order of indices until we reach needed work. - Adds new loop factors to the back in reverse order of access. + Adds new loop factors to the back in reverse order of access. Returns new list + of loops and their extants. """ for i in indices: loops, cur_loop_extants, work_needed_inner_loop = spin_out_factor( @@ -176,83 +212,9 @@ def factor_dim_in_order( break return loops, cur_loop_extants - # Layout transform allows semantics like NCHW --> NCHW4c - # Which involves splitting the original C axis into contiguous 4-element chunks - # This axis is then moved to the end (NCHWc) - # To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) - # And then the layout is just NCcHW --> NCHW4c - # input_shape, src_layout, dst_layout = handle_split_case(sch) - # Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst - def handle_block_implicit_reshape( - block_read, orig_input_shape, orig_src_layout, orig_dst_layout - ) -> Tuple[List[int], str, str]: - # Each loop should match src_layout dimension extants - loops = sch.get_loops(block_read) - - # Figure out split dimensions, entries are (loop index in src_layout, split dimension) - split_dimensions: List[Tuple[int, int]] = [] - - # This is without numeric digits, e.g. 
NCHW4c -> NCHWc - new_dst_layout = [] - - # Use state machine to parse NCHW4c string - split_size = 0 - for char in orig_dst_layout: - if char.isnumeric(): - split_size = split_size * 10 + int(char) - else: - if char.islower(): - # hit axis like 'c', need to find parent axis 'C' in src_layout - src_layout_index = orig_src_layout.index(char.upper()) - split_dimensions.append((src_layout_index, split_size)) - split_size = 0 - new_dst_layout.append(char) - - # Calculate final input shapes, each of these are a single element for unsplit dims - # and tuples for split dims associated with the two new axis - input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] - new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] - for src_layout_split_index, split_factor in split_dimensions: - dimension_name = new_src_layout[src_layout_split_index] - new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) - - remain_factor = input_shape[src_layout_split_index] // split_factor - input_shape[src_layout_split_index] = (remain_factor, split_factor) - - sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) - - # Finally to help analyzer, make layout match that of output - def index_map(*loop_indices): - answer = [] - assert len(loop_indices) == len(input_shape) - for dim, input_shape_solved in zip(loop_indices, input_shape): - if isinstance(input_shape_solved, tuple): - (_, split_factor) = input_shape_solved - answer.extend((dim // split_factor, dim % split_factor)) - else: - answer.append(dim) - return tuple(answer) - - sch.transform_layout( - block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True - ) - - # Unpack any tuples introduced via appending - def unpack_list(target_list) -> list: - output = [] - for ele in target_list: - if isinstance(ele, tuple): - output.extend(ele) - else: - output.append(ele) - return output - - new_src_layout = unpack_list(new_src_layout) - new_src_layout = "".join(new_src_layout) - new_dst_layout = "".join(new_dst_layout) - return unpack_list(input_shape), new_src_layout, new_dst_layout - - def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout): + def get_high_level_loop_structure( + block_read: BlockRV, input_shape: Sequence[int], src_layout: str, dst_layout: str + ): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops # perhaps after they have been factored. @@ -270,7 +232,7 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou # The factors which multiply to tile_size are now in back of our # list of loops. However because we added them by traversing the inner # dimensions, they are actually reversed order to guarantee the best access - # so reorder so reorder before fusing. + # so reorder before fusing. loops = loops[:rank] + loops[rank:][::-1] cur_loop_extants = cur_loop_extants[:rank] + cur_loop_extants[rank::-1] sch.reorder(*loops) @@ -300,25 +262,12 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou cur_loop_extants = cur_loop_extants[: rank + 1] cur_loop_extants.append(tile_size) - # Outer loop structure of read block matches that of src_layout - # E.g. if input_shape is [4, 6, 8]. Loops for read block will be - # for i, j, k in T.grid(4, 6, 8): - # ... 
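
The padding rule behind pad_dimension_to_factor_of_tile_size can be restated in a few lines of plain Python. The sketch below is ours, not the patch's code, and only reproduces the docstring examples:

    def padded_extent(extent: int, tile_size: int) -> int:
        # Grow the extent until it divides tile_size, unless it already exceeds tile_size.
        if extent >= tile_size:
            return extent
        while tile_size % extent != 0:
            extent += 1
        return extent

    assert padded_extent(5, 32) == 8    # pad 5 -> 8, the smallest factor of 32 that fits
    assert padded_extent(5, 36) == 6    # 6 divides 36
    assert padded_extent(8, 32) == 8    # already a factor of 32, unchanged
    assert padded_extent(33, 32) == 33  # larger than the tile, left alone
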
- # Read block will read from global memory coalesced at the start - # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block_write, 0, "shared") - - # Grab final input shape and src and dst layouts. - input_shape, src_layout, dst_layout = handle_block_implicit_reshape( - block_read, input_shape, src_layout, dst_layout - ) - - # After this we have [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] + # After this we have loops: [loop1, loop2, loop3 ... dim0_tiled, dim1_tiled] get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layout) - loops = sch.get_loops(block_read) # If there are insufficient elements, than dim1_tiled or dim0_tiled might be too small # In all likelihood you should use a smaller tile, but I don't want things to crash. + loops = sch.get_loops(block_read) loops[-1] = pad_dimension_to_at_least_number(loops[-1], tile_size) loops[-2] = pad_dimension_to_at_least_number(loops[-2], tile_size) @@ -331,7 +280,7 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou loops = loops[:-3] + [dim0_loop, loops[-3]] + loops[-2:] sch.reorder(*loops) - # After this: [outer_loop (block binding), dim0_tiled, dim1_tiled] + # After this loops are: [outer_loop (block binding), dim0_tiled, dim1_tiled] outer_loop = sch.fuse(*loops[:-2]) # Now that we have the high level loop structure, we can use reverse_compute_at magic @@ -350,9 +299,137 @@ def get_high_level_loop_structure(block_read, input_shape, src_layout, dst_layou sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") -def auto_inline(sch, start_block): - # Autoinlines given block into consumers, and repeats process for consumer of block - # Done by default for injective schedules. +def handle_block_implicit_reshape( + sch: tvm.tir.Schedule, + block_read: BlockRV, + orig_input_shape: Sequence[int], + orig_src_layout: str, + orig_dst_layout: str, +) -> Tuple[List[int], str, str]: + """ + Makes layout transform schedule applicable to implicit reshape case. + + Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous + 4-element chunks. This axis is then moved to the end (NCHWc). This is guaranteed by the operator to be done without + additional padding. To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) + in src_layout found in block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. + + Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst. + + The returned layout strings will be santized and made compatible. E.g. NCHW --> NCHW4c becomes + NCcHW --> NCHWc. + + Parameters + ---------- + sch: + The initial schedule. We expect `block_read`. We also expect + block_read's initial loops to follow the original input shape. + + block_read: + The block which reads from global memory and writes to shared memory buffer. + + orig_input_shape: + The input shape of the input buffer to the primfunc. + + orig_src_layout: + The original src_layout string. + + orig_dst_layout: + The original dst_layout string. + + Returns + ------- + ret: + A tuple of the new input shape of shared memory buffer, the new src_layout and + new dst_layout string. + """ + # Each loop should match src_layout dimension extants + loops = sch.get_loops(block_read) + + # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + split_dimensions: List[Tuple[int, int]] = [] + + # This is without numeric digits, e.g. 
NCHW4c -> NCHWc + new_dst_layout = [] + + # Use state machine to parse NCHW4c string + split_size = 0 + for char in orig_dst_layout: + if char.isnumeric(): + split_size = split_size * 10 + int(char) + else: + if char.islower(): + # hit axis like 'c', need to find parent axis 'C' in src_layout + src_layout_index = orig_src_layout.index(char.upper()) + split_dimensions.append((src_layout_index, split_size)) + split_size = 0 + new_dst_layout.append(char) + + # Calculate final input shapes, each of these are a single element for unsplit dims + # and tuples for split dims associated with the two new axis + input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] + new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + for src_layout_split_index, split_factor in split_dimensions: + dimension_name = new_src_layout[src_layout_split_index] + new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) + + remain_factor = input_shape[src_layout_split_index] // split_factor + input_shape[src_layout_split_index] = (remain_factor, split_factor) + + sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) + + # Finally to help analyzer, make layout match that of output + def index_map(*loop_indices): + answer = [] + assert len(loop_indices) == len(input_shape) + for dim, input_shape_solved in zip(loop_indices, input_shape): + if isinstance(input_shape_solved, tuple): + (_, split_factor) = input_shape_solved + answer.extend((dim // split_factor, dim % split_factor)) + else: + answer.append(dim) + return tuple(answer) + + sch.transform_layout( + block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True + ) + + # Unpack any tuples introduced via appending + def unpack_list(target_list) -> list: + output = [] + for ele in target_list: + if isinstance(ele, tuple): + output.extend(ele) + else: + output.append(ele) + return output + + new_src_layout = unpack_list(new_src_layout) + new_src_layout = "".join(new_src_layout) + new_dst_layout = "".join(new_dst_layout) + return unpack_list(input_shape), new_src_layout, new_dst_layout + + +def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: + """ + Autoinlines given block into consumers, and repeats process for consumer of block. + + Done by default for injective schedules but must manually be called in new schedule rule. + + Parameters + ---------- + sch: + The initial schedule. + + start_block: + The block to inline, should be a block which reads and writes to global memory, doing + layout transform. + + Returns + ------- + ret: + The new block inlined into it's consumers. + """ fringe = deque([start_block]) visited = set() while len(fringe) > 0: @@ -371,47 +448,72 @@ def auto_inline(sch, start_block): return cur_block -def handle_split_case(sch, input_shape, src_layout, dst_layout): +@tvm.register_func("meta_schedule.cuda.layout_transform") +def cuda_layout_transform_schedule_rule( + sch: tvm.tir.Schedule, block: BlockRV +) -> List[tvm.tir.Schedule]: """ - Handle the case where the layout transform also reshapes some axis - e.g. NCHW --> NCHW4c - (split the original C axis into chunks of 4 contiguous elements, move this axis to end) + Applies tiling scheme to layout transform task (potentially fused with other injective funcs). - Rewrites the schedule to handle this case and returns the input shape as well as - src/dst layouts to use for the later code. 
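
The layout-string rewriting done by handle_block_implicit_reshape is easier to follow on concrete values. The plain-Python sketch below is our own and mirrors only the parsing, not the schedule manipulation:

    def sanitize_layouts(src_layout, dst_layout, input_shape):
        # Parse digits in dst_layout (e.g. the "4" in NCHW4c) and record which src axis they split.
        splits, new_dst, factor = [], [], 0
        for ch in dst_layout:
            if ch.isdigit():
                factor = factor * 10 + int(ch)
            else:
                if ch.islower():
                    splits.append((src_layout.index(ch.upper()), factor))
                    factor = 0
                new_dst.append(ch)
        new_src = [c for c in src_layout]
        new_shape = [[dim] for dim in input_shape]
        for axis, split_factor in splits:
            new_src[axis] = new_src[axis] + new_src[axis].lower()
            new_shape[axis] = [input_shape[axis] // split_factor, split_factor]
        flat_shape = [dim for dims in new_shape for dim in dims]
        return "".join(new_src), "".join(new_dst), flat_shape

    # NCHW --> NCHW4c on a 16-channel input becomes NCcHW --> NCHWc on shape [1, 4, 4, 8, 8].
    print(sanitize_layouts("NCHW", "NCHW4c", [1, 16, 8, 8]))
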
- """ - pass + Returned schedules will be the default schedule, as well as tiled versions with tile_size in + the range of 2,3...32. + Parameters + ---------- + sch: + The initial schedule. -@tvm.register_func("meta_schedule.cuda.layout_transform") -def cuda_layout_transform_schedule_rule(sch, block): + block: + The block corresponding to the layout transform. + Should be a block which reads and writes to global memory, doing layout transform. + + Returns + ------- + ret: + A list of new schedules to try. + """ # params: input_buffer, output_buffer params = sch.mod["main"].params input_buffer = sch.mod["main"].buffer_map[params[0]] - output_buffer = sch.mod["main"].buffer_map[params[1]] # Info needed for tiling input_shape = [int(dim) for dim in input_buffer.shape] - output_shape = [int(dim) for dim in output_buffer.shape] src_layout = sch.get_sref(block).stmt.annotations["src_layout"] dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. - block = auto_inline(sch, block) - schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule schedules.append(sch) - - # Setup up basic structure of schedule before applying tiling sch = sch.copy() + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + block = auto_inline(sch, block) + + # Setup up basic structure of schedule of creating read into shared mem, before applying tiling + # Outer loop structure of read block matches that of src_layout + # E.g. if input_shape is [4, 6, 8]. Loops for read block will be + # for i, j, k in T.grid(4, 6, 8): + # ... + # Read block will read from global memory coalesced at the start + # Assume write to output global memory is coalesced in block_write + block_read = sch.cache_read(block, 0, "shared") + + # Handle the case where there is an implicit reshape going on. + # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW + # to NCcHW and then applying the new layout where the extant of c is 4. + # Grab final input shape and src and dst layouts. + input_shape, src_layout, dst_layout = handle_block_implicit_reshape( + sch, block_read, input_shape, src_layout, dst_layout + ) + # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. 
for tile_size in range(2, 33): - cur_sch = sch.copy() - tile_layout_transform(cur_sch, block, src_layout, dst_layout, input_shape, tile_size) - schedules.append(cur_sch) + new_sch = sch.copy() + tile_layout_transform( + new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size + ) + schedules.append(new_sch) return schedules From f2f5165611ff1d85faac04e8138a568096da761b Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 15:20:34 -0700 Subject: [PATCH 14/40] address comments --- .../schedule/cuda/layout_transform.py | 23 ++++++++++++++++--- .../postproc/rewrite_cooperative_fetch.cc | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index c3166abea85c..fe66e49752a5 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -448,6 +448,19 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: return cur_block +def get_max_tile_size() -> int: + """Returns the max tile size. + + This is assuming only threads in a warp can have coalesced accesses. 32 is the default if + no target information can be gotten. + """ + max_tile_size = 32 + cur_target = tvm.target.Target.current() + if cur_target is not None and hasattr(cur_target, "thread_warp_size"): + max_tile_size = int(cur_target.thread_warp_size) + return max_tile_size + + @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( sch: tvm.tir.Schedule, block: BlockRV @@ -456,7 +469,10 @@ def cuda_layout_transform_schedule_rule( Applies tiling scheme to layout transform task (potentially fused with other injective funcs). Returned schedules will be the default schedule, as well as tiled versions with tile_size in - the range of 2,3...32. + the range of 2,3...threads_per_warp. + + This is assuming only threads in a warp can have coalesced accesses. 32 is the default if + no target information can be gotten. Parameters ---------- @@ -508,8 +524,9 @@ def cuda_layout_transform_schedule_rule( sch, block_read, input_shape, src_layout, dst_layout ) - # Try tile size 2,3,4...32 as tile size of 1 has no coaslescing. - for tile_size in range(2, 33): + # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. + max_tile_size = get_max_tile_size() + for tile_size in range(2, max_tile_size + 1): new_sch = sch.copy() tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 7e314a50a97b..1cf0d893d3e6 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -47,7 +47,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins // and then later modify the schedule. As the passed in schedule is after running the entire trace // the bound loop may be moved around in the IRModule. // TODO: apply trace one inst at a time so schedule state is always accurate to instruction - LOG(WARNING) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. 
Error " << e.what(); return NullOpt; } } From 1fb271b5fb5b8ff739db7aa6ac45bcbcf30541dd Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 17:08:45 -0700 Subject: [PATCH 15/40] testing harness --- .../meta_schedule/schedule/cuda/layout_transform.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index fe66e49752a5..829bbff8dfeb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,6 +1,6 @@ import math from collections import deque -from typing import List, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import tvm from tvm import topi @@ -463,7 +463,7 @@ def get_max_tile_size() -> int: @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( - sch: tvm.tir.Schedule, block: BlockRV + sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = None ) -> List[tvm.tir.Schedule]: """ Applies tiling scheme to layout transform task (potentially fused with other injective funcs). @@ -483,6 +483,9 @@ def cuda_layout_transform_schedule_rule( The block corresponding to the layout transform. Should be a block which reads and writes to global memory, doing layout transform. + tile_sizes: + A list of tile sizes to try, overriding normal settings. For testing + Returns ------- ret: @@ -526,7 +529,9 @@ def cuda_layout_transform_schedule_rule( # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. max_tile_size = get_max_tile_size() - for tile_size in range(2, max_tile_size + 1): + if tile_sizes is None: + tile_sizes = range(2, max_tile_size + 1) + for tile_size in tile_sizes: new_sch = sch.copy() tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size From 31ca25a87ac49c9b5ad4c287e6d715c737dab5a4 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 17:31:20 -0700 Subject: [PATCH 16/40] more progress on testing harness --- ...schedule_schedule_cuda_layout_transform.py | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py new file mode 100644 index 000000000000..d842fd4aabff --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -0,0 +1,179 @@ +# Edge Cases: +# 1. Fusion with ops +# 2. Fusion with ops + +# Properties to test for +# 1. Compiling -- compiles well without crashing +# 2. Correctness when running +# 3. 
Autotuning ability + + +import itertools +import random +from typing import List, Optional, Tuple, Union + +import tvm +from tvm import meta_schedule, relay +from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule +from tvm.relay.op import OpPattern + + +# Create unary functions which apply ops with compatible fusion levels +def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) + + +def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) + + +def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) + + +def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) + + +def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE + + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) + + +# Applying the actual layout transform will be different +def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + +# These are the only levels of op which can possibly be fused with layout_transform (which injective) +extra_pattern_level_to_op = { + OpPattern.ELEMWISE: apply_elemwise_clip, + OpPattern.BROADCAST: apply_broadcast_add, + OpPattern.INJECTIVE: apply_injective_concatenate, + OpPattern.COMM_REDUCE: apply_comm_reduce_max, +} + + +def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] +) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info + relay.transform.InferTypeLocal(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) + else: + cur_data = extra_pattern_level_to_op[op_info](cur_data) + + relay.transform.InferTypeLocal(cur_data) + return tvm.IRModule.from_expr(cur_data) + + +def generate_test_case( + input_shape: List[int], + implicit_reshape_info: Optional[Tuple[int, int]], + dtype: str, + num_additional_ops: int, +): + # Create layout transforms + rank = len(input_shape) + src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) + + dst_layout = list(src_layout) + if implicit_reshape_info: + axis_to_reshape, size_new_dim = implicit_reshape_info + cur_dim = dst_layout[axis_to_reshape] + dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" + random.shuffle(dst_layout) + dst_layout = "".join(dst_layout) + + op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) + op_choices.append((src_layout, dst_layout)) + + random.shuffle(op_choices) + return create_relay_module(input_shape, dtype, op_choices) + + +def verify_schedule(sch: tvm.tir.Schedule, tile_sizes: List[int]): + block_layout_transform = sch.get_block("T_layout_trans") + schedules = cuda_layout_transform_schedule_rule(sch, block_layout_transform, tile_sizes) + + assert len(schedules) == len(tile_sizes) + 1 + + # This is the default schedule which does not apply the schedule rule + schedule_baseline = schedules[0] + + # These are the tiled schedules we want to test + schedule_end = schedules[1:] + # TODO + + +def generate_all_test_case( + # Each has ~10k elements + input_shapes: List[List[int]] = [ + [12, 48, 18], + [890, 14], + [10, 12, 2, 5, 3, 3], + ], + implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], + dtypes: List[str] = ["float32", "float16"], + num_additional_ops: int = 5, + tile_sizes: List[int] = [32, 20, 19], + repeats_per_condition=10, +): + for _ in range(repeats_per_condition): + for input_shape, implicit_reshape_info, dtype in itertools.product( + input_shapes, implicit_reshape_conditions, dtypes + ): + # Generate random module of fusable ops + layout transform and extract fused layout transform task + mod = generate_test_case(input_shape, implicit_reshape_info, dtype, num_additional_ops) + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + tvm.target.Target("cuda"), + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # Fused layout transform task + dispatched_mod = task_of_interest.dispatched[0] + base_schedule = tvm.tir.Schedule(dispatched_mod) + print(mod) + verify_schedule(base_schedule, tile_sizes) + + +if __name__ == "__main__": + generate_all_test_case() From 585191b20c38d5baf6bfac69ba3c72c167714edd Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 18:03:08 -0700 Subject: [PATCH 17/40] fix case where shape changes in mod --- include/tvm/topi/transform.h | 3 +- 
.../schedule/cuda/layout_transform.py | 9 ++---- ...schedule_schedule_cuda_layout_transform.py | 30 ++++++++++++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index ff57f99075ff..aa75fb05a067 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1619,7 +1619,8 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, Map attrs = {{"schedule_rule", String(schedule_rule)}, // Information about layouts needed for the schedule rule {"src_layout", String(src_layout)}, - {"dst_layout", String(dst_layout)}}; + {"dst_layout", String(dst_layout)}, + {"input_shape", src->shape}}; return compute( dst_shape, diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 829bbff8dfeb..4c9fee2fb921 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -187,7 +187,7 @@ def spin_out_factor( loops[index] = new_loop_split loops.append(factored_loop) - loop_extants[index] = math.ceil(new_size / split_factor) + loop_extants[index] = math.ceil(int(new_size) / int(split_factor)) loop_extants.append(split_factor) sch.reorder(*loops) @@ -491,14 +491,10 @@ def cuda_layout_transform_schedule_rule( ret: A list of new schedules to try. """ - # params: input_buffer, output_buffer - params = sch.mod["main"].params - input_buffer = sch.mod["main"].buffer_map[params[0]] - # Info needed for tiling - input_shape = [int(dim) for dim in input_buffer.shape] src_layout = sch.get_sref(block).stmt.annotations["src_layout"] dst_layout = sch.get_sref(block).stmt.annotations["dst_layout"] + input_shape = [int(c) for c in sch.get_sref(block).stmt.annotations["input_shape"]] schedules = [] @@ -526,7 +522,6 @@ def cuda_layout_transform_schedule_rule( input_shape, src_layout, dst_layout = handle_block_implicit_reshape( sch, block_read, input_shape, src_layout, dst_layout ) - # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. 
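
To see why a tile size of 1 defeats coalescing: the point of the tile is that a warp's global reads are contiguous along the source layout's innermost dimension and its global writes are contiguous along the destination's innermost dimension, with the reshuffle happening in shared memory; with a single-element tile every access stands alone. A rough, standalone illustration of the effect on memory traffic (assuming, as a simplification, a 32-byte minimum global-memory transaction):

    import math

    def transactions_per_row(tile_size: int, dtype_bytes: int, segment: int = 32) -> int:
        # Whole transactions needed to move one contiguous row of the tile.
        return max(1, math.ceil(tile_size * dtype_bytes / segment))

    print(transactions_per_row(1, 4))    # 1 transaction carrying only 4 useful bytes
    print(transactions_per_row(32, 4))   # 4 transactions carrying 128 useful bytes
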
max_tile_size = get_max_tile_size() if tile_sizes is None: diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index d842fd4aabff..0251a1d1f651 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -108,7 +108,10 @@ def generate_test_case( axis_to_reshape, size_new_dim = implicit_reshape_info cur_dim = dst_layout[axis_to_reshape] dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" + random.shuffle(dst_layout) + while "".join(dst_layout) == src_layout: + random.shuffle(dst_layout) dst_layout = "".join(dst_layout) op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) @@ -141,7 +144,7 @@ def generate_all_test_case( ], implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 5, + num_additional_ops: int = 1, tile_sizes: List[int] = [32, 20, 19], repeats_per_condition=10, ): @@ -176,4 +179,29 @@ def generate_all_test_case( if __name__ == "__main__": + mod = create_relay_module([890, 14], "float32", [("AB", "BA"), 2]) + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + tvm.target.Target("cuda"), + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # # Fused layout transform task + # dispatched_mod = task_of_interest.dispatched[0] + # base_schedule = tvm.tir.Schedule(dispatched_mod) + # verify_schedule(base_schedule, [32, 20, 19]) + + breakpoint() + generate_all_test_case() From 9822f4ce13a072513782a485e667196957158d4c Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 13 Mar 2023 18:15:48 -0700 Subject: [PATCH 18/40] inline after schedule genreation to help analysis --- .../schedule/cuda/layout_transform.py | 13 +++--- ...schedule_schedule_cuda_layout_transform.py | 42 +++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 4c9fee2fb921..e19c888fc387 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -73,7 +73,7 @@ def tile_layout_transform( dst_layout: str, input_shape: List[int], tile_size: ExprRV, -): +) -> Tuple[BlockRV, BlockRV]: """ High level tiling for layout transform block. Mutates sch in place. @@ -298,6 +298,8 @@ def get_high_level_loop_structure( sch.bind(loop=inner_write_loop, thread_axis="threadIdx.x") sch.bind(loop=inner_read_loop, thread_axis="threadIdx.x") + return block_write, block_read + def handle_block_implicit_reshape( sch: tvm.tir.Schedule, @@ -502,10 +504,6 @@ def cuda_layout_transform_schedule_rule( schedules.append(sch) sch = sch.copy() - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. 
- block = auto_inline(sch, block) - # Setup up basic structure of schedule of creating read into shared mem, before applying tiling # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be @@ -528,9 +526,12 @@ def cuda_layout_transform_schedule_rule( tile_sizes = range(2, max_tile_size + 1) for tile_size in tile_sizes: new_sch = sch.copy() - tile_layout_transform( + block_write, block_read = tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size ) + # For each schedule we also want to inline each stage as would be done in normal circumstances + # to prevent extraneous memory access. + auto_inline(new_sch, block_write) schedules.append(new_sch) return schedules diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 0251a1d1f651..281fabbeb5e2 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -179,29 +179,29 @@ def generate_all_test_case( if __name__ == "__main__": - mod = create_relay_module([890, 14], "float32", [("AB", "BA"), 2]) - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - tvm.target.Target("cuda"), - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - - # # Fused layout transform task + # mod = create_relay_module([12, 48, 18], "float32", [("ABC", "B2bAC"), 2]) + # extracted_tasks = meta_schedule.relay_integration.extract_tasks( + # mod, + # tvm.target.Target("cuda"), + # {}, + # pass_config={ + # "relay.backend.use_meta_schedule": True, + # "relay.FuseOps.max_depth": 30, + # "relay.backend.tir_converter": "default", + # }, + # ) + # task_of_interest = None + # for task in extracted_tasks: + # if "layout_transform" in task.task_name: + # task_of_interest = task + # break + # assert task_of_interest is not None + + # # # Fused layout transform task # dispatched_mod = task_of_interest.dispatched[0] # base_schedule = tvm.tir.Schedule(dispatched_mod) - # verify_schedule(base_schedule, [32, 20, 19]) + # verify_schedule(base_schedule, [30, 20, 19]) - breakpoint() + # exit() generate_all_test_case() From e152e394c58d891e46f777ae5623d296b3e6f713 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 14 Mar 2023 13:44:45 -0700 Subject: [PATCH 19/40] proper autoinlining INTO layout transform block to maintain extants --- .../schedule/cuda/layout_transform.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index e19c888fc387..50388b34bca3 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -3,7 +3,7 @@ from typing import List, Optional, Sequence, Tuple, Union import tvm -from tvm import topi +from tvm import meta_schedule, topi from tvm.tir.schedule import BlockRV, ExprRV, LoopRV ## Tiling layout transforms: @@ -412,11 +412,9 @@ def unpack_list(target_list) -> list: return unpack_list(input_shape), 
new_src_layout, new_dst_layout -def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: +def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: """ - Autoinlines given block into consumers, and repeats process for consumer of block. - - Done by default for injective schedules but must manually be called in new schedule rule. + Inlines given start_block's consumers and future dependencies into start_block. Parameters ---------- @@ -424,7 +422,7 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: The initial schedule. start_block: - The block to inline, should be a block which reads and writes to global memory, doing + The block to inline into, should be a block which reads and writes to global memory, doing layout transform. Returns @@ -432,7 +430,7 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: ret: The new block inlined into it's consumers. """ - fringe = deque([start_block]) + fringe = deque(sch.get_consumers(start_block)) visited = set() while len(fringe) > 0: cur_block = fringe.popleft() @@ -442,12 +440,17 @@ def auto_inline(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: visited.add(cur_block) consumer_blocks = sch.get_consumers(cur_block) - if len(consumer_blocks) >= 1: - fringe.extend(consumer_blocks) - sch.compute_inline(cur_block) - else: - # Found output block, no more inlining needed - return cur_block + fringe.extend(consumer_blocks) + + autoinline_rule = meta_schedule.schedule_rule.AutoInline( + into_producer=True, + into_consumer=False, + inline_const_tensor=True, + disallow_if_then_else=False, + require_injective=False, + require_ordered=False, + ) + sch = autoinline_rule.apply(sch, cur_block)[0] def get_max_tile_size() -> int: @@ -504,6 +507,12 @@ def cuda_layout_transform_schedule_rule( schedules.append(sch) sch = sch.copy() + # Inline consumers of the layout transform into the layout transform block. + # Normally default for injective schedules but must manually be called in new schedule rule + # as we introduce a new block under the custom schedule rule which is not taken into account + # during search space generation. TODO: rectify this. + auto_inline_into(sch, block) + # Setup up basic structure of schedule of creating read into shared mem, before applying tiling # Outer loop structure of read block matches that of src_layout # E.g. if input_shape is [4, 6, 8]. Loops for read block will be @@ -520,18 +529,16 @@ def cuda_layout_transform_schedule_rule( input_shape, src_layout, dst_layout = handle_block_implicit_reshape( sch, block_read, input_shape, src_layout, dst_layout ) + # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. max_tile_size = get_max_tile_size() if tile_sizes is None: tile_sizes = range(2, max_tile_size + 1) for tile_size in tile_sizes: new_sch = sch.copy() - block_write, block_read = tile_layout_transform( + tile_layout_transform( new_sch, block_read, block, src_layout, dst_layout, input_shape, tile_size ) - # For each schedule we also want to inline each stage as would be done in normal circumstances - # to prevent extraneous memory access. 
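
The point of this change is the direction of inlining: rather than pushing the layout-transform block forward into its consumers (which rewrites its loop extents), the consumers are folded back into the layout-transform block so its extents survive for the tiling analysis. A minimal, self-contained sketch of that direction using the schedule primitive on a toy kernel (illustrative only, not code from this patch; the AutoInline rule above applies the same idea across a whole chain of consumers):

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def toy(a: T.handle, c: T.handle):
        A = T.match_buffer(a, (16,), "float32")
        C = T.match_buffer(c, (16,), "float32")
        B = T.alloc_buffer((16,), "float32")
        for i in T.serial(16):
            with T.block("layout_like"):   # stands in for the layout transform block
                vi = T.axis.spatial(16, i)
                B[vi] = A[vi] + T.float32(1)
        for i in T.serial(16):
            with T.block("consumer"):      # stands in for a fused elementwise op
                vi = T.axis.spatial(16, i)
                C[vi] = B[vi] * T.float32(2)

    sch = tvm.tir.Schedule(toy)
    # Fold the consumer into its producer; the producer's loop structure is preserved.
    sch.reverse_compute_inline(sch.get_block("consumer"))
    # sch.mod["main"] now contains a single 16-iteration loop computing (A[i] + 1) * 2.
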
- auto_inline(new_sch, block_write) schedules.append(new_sch) return schedules From 8274d21ba07a0858f15e2eab0615862aac852e16 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 14 Mar 2023 15:32:02 -0700 Subject: [PATCH 20/40] clean up --- .../schedule/cuda/layout_transform.py | 18 +- ...schedule_schedule_cuda_layout_transform.py | 165 ++++++++++++++---- 2 files changed, 144 insertions(+), 39 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 50388b34bca3..5a3637af94cf 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -468,7 +468,7 @@ def get_max_tile_size() -> int: @tvm.register_func("meta_schedule.cuda.layout_transform") def cuda_layout_transform_schedule_rule( - sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = None + sch: tvm.tir.Schedule, block: BlockRV, testing_tile_sizes: Optional[List[int]] = None ) -> List[tvm.tir.Schedule]: """ Applies tiling scheme to layout transform task (potentially fused with other injective funcs). @@ -488,8 +488,9 @@ def cuda_layout_transform_schedule_rule( The block corresponding to the layout transform. Should be a block which reads and writes to global memory, doing layout transform. - tile_sizes: - A list of tile sizes to try, overriding normal settings. For testing + testing_tile_sizes: + A list of tile sizes to try, overriding normal settings. For testing. None means + ignore. Else overrides normal settings of tile sizes to try. Returns ------- @@ -504,7 +505,8 @@ def cuda_layout_transform_schedule_rule( schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule - schedules.append(sch) + if not testing_tile_sizes: + schedules.append(sch) sch = sch.copy() # Inline consumers of the layout transform into the layout transform block. @@ -531,9 +533,11 @@ def cuda_layout_transform_schedule_rule( ) # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. 
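
None of the candidate tile sizes has to divide the loop extents evenly: the tiling helpers pad a loop up to the next multiple of the requested factor and let the predicate introduced by the split mask off the tail. A pure-Python sketch of that bookkeeping with hypothetical extents (the real code does this through sch.split and the loop_extants list):

    import math

    new_size, split_factor = 14, 4                        # hypothetical extent and tile factor
    outer = math.ceil(int(new_size) / int(split_factor))  # 4 outer iterations
    padded_extent = outer * split_factor                  # 16 elements covered, 2 of them padding

The int() casts mirror the earlier fix in spin_out_factor, where the extents may arrive as tvm.tir.IntImm values (for example via the "input_shape" annotation) rather than Python ints; treating that as the motivation is an assumption drawn from the surrounding change.
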
- max_tile_size = get_max_tile_size() - if tile_sizes is None: - tile_sizes = range(2, max_tile_size + 1) + if testing_tile_sizes is None: + tile_sizes = range(2, get_max_tile_size() + 1) + else: + tile_sizes = testing_tile_sizes + for tile_size in tile_sizes: new_sch = sch.copy() tile_layout_transform( diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 281fabbeb5e2..fb4452bc8637 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -12,10 +12,13 @@ import random from typing import List, Optional, Tuple, Union +import numpy as np + import tvm from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern +from tvm.tir.schedule import BlockRV, ExprRV, LoopRV # Create unary functions which apply ops with compatible fusion levels @@ -66,6 +69,29 @@ def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): } +class PatchCustomLayoutTransformScheduleRule: + """Patch the custom layout transform schedule to test only specific tile sizes.""" + + FUNC_NAME = "meta_schedule.cuda.layout_transform" + + def __init__(self, tile_sizes: List[int]) -> None: + self.tile_sizes = tile_sizes + self.old_func = None + + def __enter__(self, *args, **kwargs) -> None: + self.old_func = tvm.get_global_func(self.FUNC_NAME) + + def new_layout_rule( + sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = self.tile_sizes + ) -> List[tvm.tir.Schedule]: + return cuda_layout_transform_schedule_rule(sch, block, tile_sizes) + + tvm.register_func(self.FUNC_NAME, new_layout_rule, override=True) + + def __exit__(self, *args, **kwargs) -> None: + tvm.register_func(self.FUNC_NAME, self.old_func, override=True) + + def create_relay_module( input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] ) -> tvm.IRModule: @@ -98,7 +124,7 @@ def generate_test_case( implicit_reshape_info: Optional[Tuple[int, int]], dtype: str, num_additional_ops: int, -): +) -> tvm.IRModule: # Create layout transforms rank = len(input_shape) src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) @@ -121,18 +147,101 @@ def generate_test_case( return create_relay_module(input_shape, dtype, op_choices) -def verify_schedule(sch: tvm.tir.Schedule, tile_sizes: List[int]): - block_layout_transform = sch.get_block("T_layout_trans") - schedules = cuda_layout_transform_schedule_rule(sch, block_layout_transform, tile_sizes) - - assert len(schedules) == len(tile_sizes) + 1 +def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target +) -> Tuple[tvm.IRModule, tvm.IRModule]: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.max_depth": 30, + "relay.backend.tir_converter": "default", + }, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + + # Fused layout transform task + relay_mod = task_of_interest.mod + dispatched_mod = task_of_interest.dispatched[0] + return relay_mod, dispatched_mod + + +def run_primfunc( + 
primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] +): + with tvm.transform.PassContext( + config={ + "relay.backend.use_meta_schedule": True, + "relay.backend.use_meta_schedule_dispatch": False, + "relay.FuseOps.max_depth": 30, + }, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + +def verify_layout_transform_task( + relay_mod: tvm.IRModule, + dispatched_mod: tvm.IRModule, + target: tvm.target.Target, + tile_sizes: List[int], +): + """Given a layout transform primfunc, tests the given tile_sizes and verifies output matches.""" + space_generator = meta_schedule.space_generator.PostOrderApply( + sch_rules=meta_schedule.schedule_rule.schedule_rule.create("cuda"), + postprocs=meta_schedule.postproc.postproc.create("cuda"), + mutator_probs=meta_schedule.mutator.mutator.create("cuda"), + ) + device = tvm.cuda(0) + + func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) + input_tensors = [] + for input_type in func_type.arg_types: + orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( + input_type.dtype + ) + input_tensors.append(tvm.nd.array(orig_input_np, device)) + ret_type = func_type.ret_type + + def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: + numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( + ret_type.dtype + ) + return tvm.nd.array(numpy_init, device) + + def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: + # By setting the tile_sizes to search to nothing, the layout transform rule just returns + # the original schedule. + tile_size_input = [] if tile_size is None else [tile_size] + with PatchCustomLayoutTransformScheduleRule(tile_sizes=tile_size_input): + tune_context = meta_schedule.TuneContext( + mod=dispatched_mod, + target=target, + space_generator=space_generator, + search_strategy=meta_schedule.search_strategy.create(), + ) + tune_context.pre_tuning(32) + returned_primfunc = tune_context.generate_measure_candidates()[0].sch.mod + output_tensor = get_output_tensor() + run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) + # print(returned_primfunc) + return output_tensor.numpy() - # This is the default schedule which does not apply the schedule rule - schedule_baseline = schedules[0] + # Passing None, we basically do not apply the custom rule we have created. + ground_truth_np = run_and_get_output(None) + for tile_size in tile_sizes: + experimental_result_np = run_and_get_output(tile_size) - # These are the tiled schedules we want to test - schedule_end = schedules[1:] - # TODO + np.testing.assert_allclose(ground_truth_np, experimental_result_np) def generate_all_test_case( @@ -144,38 +253,30 @@ def generate_all_test_case( ], implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 1, + num_additional_ops: int = 0, tile_sizes: List[int] = [32, 20, 19], repeats_per_condition=10, ): + # Small numbers which should work for nearly every (modern-ish) gpu. 
+ target = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + ) for _ in range(repeats_per_condition): for input_shape, implicit_reshape_info, dtype in itertools.product( input_shapes, implicit_reshape_conditions, dtypes ): # Generate random module of fusable ops + layout transform and extract fused layout transform task - mod = generate_test_case(input_shape, implicit_reshape_info, dtype, num_additional_ops) - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - tvm.target.Target("cuda"), - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, + full_mod = generate_test_case( + input_shape, implicit_reshape_info, dtype, num_additional_ops ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None # Fused layout transform task - dispatched_mod = task_of_interest.dispatched[0] - base_schedule = tvm.tir.Schedule(dispatched_mod) - print(mod) - verify_schedule(base_schedule, tile_sizes) + relay_mod, dispatched_mod = extract_layout_transform_task(full_mod, target) + + print(relay_mod) + verify_layout_transform_task(relay_mod, dispatched_mod, target, tile_sizes) + print("Verified!") + print() if __name__ == "__main__": From 1c7aa1941d6cd0b96a4783c1ba4389c8872d83b1 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 13:03:05 -0700 Subject: [PATCH 21/40] reindex for introducing cache block --- .../schedule/cuda/layout_transform.py | 65 ++++++++++--------- .../tvm/meta_schedule/schedule/cuda/test.py | 25 +++++++ 2 files changed, 59 insertions(+), 31 deletions(-) create mode 100644 python/tvm/meta_schedule/schedule/cuda/test.py diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 5a3637af94cf..aa2378443602 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -301,9 +301,9 @@ def get_high_level_loop_structure( return block_write, block_read -def handle_block_implicit_reshape( +def create_cached_read( sch: tvm.tir.Schedule, - block_read: BlockRV, + block_write: BlockRV, orig_input_shape: Sequence[int], orig_src_layout: str, orig_dst_layout: str, @@ -345,10 +345,7 @@ def handle_block_implicit_reshape( A tuple of the new input shape of shared memory buffer, the new src_layout and new dst_layout string. """ - # Each loop should match src_layout dimension extants - loops = sch.get_loops(block_read) - - # Figure out split dimensions, entries are (loop index in src_layout, split dimension) + # Figure out split dimensions, entries are (loop index in src_layout, split amount) split_dimensions: List[Tuple[int, int]] = [] # This is without numeric digits, e.g. 
NCHW4c -> NCHWc @@ -367,6 +364,11 @@ def handle_block_implicit_reshape( split_size = 0 new_dst_layout.append(char) + # If no splits were detected we are done + if len(split_dimensions) == 0: + block_read = sch.cache_read(block_write, 0, "shared") + return block_read, orig_input_shape, orig_src_layout, orig_dst_layout + # Calculate final input shapes, each of these are a single element for unsplit dims # and tuples for split dims associated with the two new axis input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] @@ -374,27 +376,10 @@ def handle_block_implicit_reshape( for src_layout_split_index, split_factor in split_dimensions: dimension_name = new_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) - - remain_factor = input_shape[src_layout_split_index] // split_factor - input_shape[src_layout_split_index] = (remain_factor, split_factor) - - sch.split(loops[src_layout_split_index], [remain_factor, split_factor]) - - # Finally to help analyzer, make layout match that of output - def index_map(*loop_indices): - answer = [] - assert len(loop_indices) == len(input_shape) - for dim, input_shape_solved in zip(loop_indices, input_shape): - if isinstance(input_shape_solved, tuple): - (_, split_factor) = input_shape_solved - answer.extend((dim // split_factor, dim % split_factor)) - else: - answer.append(dim) - return tuple(answer) - - sch.transform_layout( - block_read, buffer=("write", 0), index_map=index_map, assume_injective_transform=True - ) + input_shape[src_layout_split_index] = ( + input_shape[src_layout_split_index] // split_factor, + split_factor, + ) # Unpack any tuples introduced via appending def unpack_list(target_list) -> list: @@ -409,7 +394,25 @@ def unpack_list(target_list) -> list: new_src_layout = unpack_list(new_src_layout) new_src_layout = "".join(new_src_layout) new_dst_layout = "".join(new_dst_layout) - return unpack_list(input_shape), new_src_layout, new_dst_layout + + # Write block loop extants match + reindex_map = [new_src_layout.index(dim) for dim in new_dst_layout] + block_read = sch.reindex_cache_read( + block_write, + read_buffer_index=0, + index_map=tvm.tir.IndexMap.from_func( + lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], + ndim=len(new_src_layout), + ), + storage_scope="shared", + ) + + # While the above will have the shared memory buffer match the reshaped input tensor + # the loops still match those of the write/output loop/buffer. Match the src layout instead + loops_read = sch.get_loops(block_read) + sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout)]) + + return block_read, unpack_list(input_shape), new_src_layout, new_dst_layout def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: @@ -522,14 +525,14 @@ def cuda_layout_transform_schedule_rule( # ... # Read block will read from global memory coalesced at the start # Assume write to output global memory is coalesced in block_write - block_read = sch.cache_read(block, 0, "shared") + # block_read = sch.cache_read(block, 0, "shared") # Handle the case where there is an implicit reshape going on. # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW # to NCcHW and then applying the new layout where the extant of c is 4. # Grab final input shape and src and dst layouts. 
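
The reindex map computed above is simply the permutation relating the two layout strings: entry k records where the k-th destination axis lives in the (split) source layout, and that permutation is what the IndexMap hands to reindex_cache_read. A small worked example with concrete layout strings chosen for illustration:

    new_src_layout = "NCcHW"   # layout of the implicitly split input / shared buffer
    new_dst_layout = "NCHWc"   # layout of the output
    reindex_map = [new_src_layout.index(d) for d in new_dst_layout]
    # reindex_map == [0, 1, 3, 4, 2]: for each destination axis, the position of the
    # matching axis in the source layout (N->0, C->1, H->3, W->4, c->2).
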
- input_shape, src_layout, dst_layout = handle_block_implicit_reshape( - sch, block_read, input_shape, src_layout, dst_layout + block_read, input_shape, src_layout, dst_layout = create_cached_read( + sch, block, input_shape, src_layout, dst_layout ) # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. diff --git a/python/tvm/meta_schedule/schedule/cuda/test.py b/python/tvm/meta_schedule/schedule/cuda/test.py new file mode 100644 index 000000000000..33d56361f491 --- /dev/null +++ b/python/tvm/meta_schedule/schedule/cuda/test.py @@ -0,0 +1,25 @@ +# Read step +print("read") +for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): + for ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused in range(3): + for ax0_0_1_fused_0_ax0_0_1_fused_1_fused in range(3): + v0 = ( + ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ax0_0_1_fused_0_ax0_0_1_fused_1_fused + ) * (2) + ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused + print( + f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused} -- index: {v0}" + ) + + +# Write step +print("write") +for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): + for ax0_ax1_fused_0 in range(3): + for ax0_ax1_fused_1 in range(3): + v_ax0 = (ax0_ax1_fused_0 * 3 + ax0_ax1_fused_1) // (4) + v_ax1 = ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ( + ax0_ax1_fused_0 * (3) + ax0_ax1_fused_1 + ) % (4) + print( + f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_ax1_fused_1}-- index: {v_ax0}, {v_ax1}" + ) From e70466477d9021b5e92c0d15a4423d11e9cfc9d3 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:12:07 -0700 Subject: [PATCH 22/40] reorganize testing --- ...schedule_schedule_cuda_layout_transform.py | 512 +++++++++--------- 1 file changed, 265 insertions(+), 247 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index fb4452bc8637..318af4326542 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -10,67 +10,26 @@ import itertools import random -from typing import List, Optional, Tuple, Union +import tempfile +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np +import pytest import tvm +import tvm.testing from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern from tvm.tir.schedule import BlockRV, ExprRV, LoopRV -# Create unary functions which apply ops with compatible fusion levels -def get_random_axis(data: relay.Expr): - rank = len(relay.transform.InferTypeLocal(data).shape) - return random.randint(0, rank - 1) - - -def apply_elemwise_clip(data: relay.Expr, min=0, max=10): - assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE - return relay.clip(data, min, max) - - -def apply_broadcast_add(data: relay.Expr, val_to_add=5): - assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST - type_info = relay.transform.InferTypeLocal(data) - return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) - - -def apply_injective_concatenate(data: relay.Expr, axis=None): - if axis is None: - axis = get_random_axis(data) - assert relay.op.get("concatenate").get_attr("TOpPattern") == 
OpPattern.INJECTIVE - return relay.concatenate([data, data], axis) - - -def apply_comm_reduce_max(data: relay.Expr, axis=None): - if axis is None: - axis = get_random_axis(data) - assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE - - # Do this to maintain dimensions - return relay.add(data, relay.max(data, axis, keepdims=True)) - - -# Applying the actual layout transform will be different -def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): - assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.layout_transform(data, src_layout, dst_layout) - - -# These are the only levels of op which can possibly be fused with layout_transform (which injective) -extra_pattern_level_to_op = { - OpPattern.ELEMWISE: apply_elemwise_clip, - OpPattern.BROADCAST: apply_broadcast_add, - OpPattern.INJECTIVE: apply_injective_concatenate, - OpPattern.COMM_REDUCE: apply_comm_reduce_max, -} - - class PatchCustomLayoutTransformScheduleRule: - """Patch the custom layout transform schedule to test only specific tile sizes.""" + """Patch the custom layout transform schedule to test only specific tile sizes. + + If tile_sizes = [], then returns the default (non-tiled) schedule, otherwise + returns only the schedule with the given tiles. + """ FUNC_NAME = "meta_schedule.cuda.layout_transform" @@ -82,7 +41,9 @@ def __enter__(self, *args, **kwargs) -> None: self.old_func = tvm.get_global_func(self.FUNC_NAME) def new_layout_rule( - sch: tvm.tir.Schedule, block: BlockRV, tile_sizes: Optional[List[int]] = self.tile_sizes + sch: tvm.tir.Schedule, + block: BlockRV, + tile_sizes: Optional[List[int]] = self.tile_sizes, ) -> List[tvm.tir.Schedule]: return cuda_layout_transform_schedule_rule(sch, block, tile_sizes) @@ -92,217 +53,274 @@ def __exit__(self, *args, **kwargs) -> None: tvm.register_func(self.FUNC_NAME, self.old_func, override=True) -def create_relay_module( - input_shape: List[int], dtype: str, ops: List[Union[int, Tuple[str, str]]] -) -> tvm.IRModule: - """Create a relay module with the given string of ops. +class TestRandomRelayE2ECorrectness: + """Tests E2E correctness of layout transform schedule. - ops: - Applies the associated operators in order. If an integer, refers to applying - the unary operator from `extra_pattern_level_to_op` map. If a tuple, applies - a layout transform with the given (src_layout, dst_layout) + Randomly generates relay mod with layout transform and fusable ops. Checks the + layout transform task for correctness by comparing against its unscheduled result. 
""" - input_data = relay.var("input", shape=input_shape, dtype=dtype) - cur_data = input_data - for op_info in ops: - # Progressively build type info + # Create unary functions which apply ops with compatible fusion levels to layout transform + @staticmethod + def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) + + @staticmethod + def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) + + @staticmethod + def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) + + @staticmethod + def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = TestRandomRelayE2ECorrectness.get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) + + @staticmethod + def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = TestRandomRelayE2ECorrectness.get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE + + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) + + @staticmethod + def get_map_pattern_level_to_op() -> Dict[OpPattern, Callable]: + # These are the only levels of op which can possibly be fused with layout_transform (which injective) + return { + OpPattern.ELEMWISE: TestRandomRelayE2ECorrectness.apply_elemwise_clip, + OpPattern.BROADCAST: TestRandomRelayE2ECorrectness.apply_broadcast_add, + OpPattern.INJECTIVE: TestRandomRelayE2ECorrectness.apply_injective_concatenate, + OpPattern.COMM_REDUCE: TestRandomRelayE2ECorrectness.apply_comm_reduce_max, + } + + @staticmethod + def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + @staticmethod + def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] + ) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info + relay.transform.InferTypeLocal(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = TestRandomRelayE2ECorrectness.apply_layout_transform( + cur_data, src_layout, dst_layout + ) + else: + cur_data = TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op()[op_info]( + cur_data + ) + relay.transform.InferTypeLocal(cur_data) - if isinstance(op_info, tuple): - # layout transform case - src_layout, dst_layout = op_info - cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) - else: - cur_data = extra_pattern_level_to_op[op_info](cur_data) - - relay.transform.InferTypeLocal(cur_data) - return tvm.IRModule.from_expr(cur_data) - - -def generate_test_case( - input_shape: List[int], - implicit_reshape_info: Optional[Tuple[int, int]], - dtype: str, - num_additional_ops: int, -) -> tvm.IRModule: - # Create layout transforms - rank = len(input_shape) - src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) - - dst_layout = list(src_layout) - if implicit_reshape_info: - axis_to_reshape, size_new_dim = implicit_reshape_info - cur_dim = dst_layout[axis_to_reshape] - dst_layout[axis_to_reshape] = f"{cur_dim}{size_new_dim}{cur_dim.lower()}" - - random.shuffle(dst_layout) - while "".join(dst_layout) == src_layout: - random.shuffle(dst_layout) - dst_layout = "".join(dst_layout) - - op_choices = random.choices(list(extra_pattern_level_to_op.keys()), k=num_additional_ops) - op_choices.append((src_layout, dst_layout)) - - random.shuffle(op_choices) - return create_relay_module(input_shape, dtype, op_choices) - - -def extract_layout_transform_task( - mod: tvm.IRModule, target: tvm.target.Target -) -> Tuple[tvm.IRModule, tvm.IRModule]: - """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - target, - {}, - pass_config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.max_depth": 30, - "relay.backend.tir_converter": "default", - }, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - - # Fused layout transform task - relay_mod = task_of_interest.mod - dispatched_mod = task_of_interest.dispatched[0] - return relay_mod, dispatched_mod - - -def run_primfunc( - primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] -): - with tvm.transform.PassContext( - config={ - "relay.backend.use_meta_schedule": True, - "relay.backend.use_meta_schedule_dispatch": False, - "relay.FuseOps.max_depth": 30, - }, - opt_level=3, - ): - lib = tvm.build(primfunc_mod, target=target) - lib(*input_tensors) - - -def verify_layout_transform_task( - relay_mod: tvm.IRModule, - dispatched_mod: tvm.IRModule, - target: tvm.target.Target, - tile_sizes: List[int], -): - """Given a layout transform primfunc, tests the given tile_sizes and verifies output matches.""" - space_generator = meta_schedule.space_generator.PostOrderApply( - sch_rules=meta_schedule.schedule_rule.schedule_rule.create("cuda"), - postprocs=meta_schedule.postproc.postproc.create("cuda"), - mutator_probs=meta_schedule.mutator.mutator.create("cuda"), - ) - device = tvm.cuda(0) + 
return tvm.IRModule.from_expr(cur_data) + + @staticmethod + def generate_test_case( + input_shape: List[int], + implicit_reshape_info: Optional[Tuple[int, int]], + dtype: str, + num_additional_ops: int, + ) -> tvm.IRModule: + """Creates a random layout transform module with up to num_additional_ops fused.""" + # Create layout transforms + rank = len(input_shape) + + # src_layout is a string like ABCDEFG... with length as rank + src_layout = "".join([chr(i + ord("A")) for i in range(rank)]) + + # dst_layout is randomly shuffled src_layout, potentially after adding split axis + dst_layout = list(src_layout) + if implicit_reshape_info: + axis_to_reshape, size_new_dim = implicit_reshape_info + cur_dim = dst_layout[axis_to_reshape] + dst_layout[axis_to_reshape] = f"{cur_dim}" + dst_layout.append(f"{size_new_dim}{cur_dim.lower()}") - func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) - input_tensors = [] - for input_type in func_type.arg_types: - orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( - input_type.dtype + random.shuffle(dst_layout) + while "".join(dst_layout) == src_layout: + random.shuffle(dst_layout) + dst_layout = "".join(dst_layout) + + # Randomly sample a list of potentially fusable ops to layout transform + op_order = random.choices( + list(TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op().keys()), + k=num_additional_ops, ) - input_tensors.append(tvm.nd.array(orig_input_np, device)) - ret_type = func_type.ret_type - def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: - numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( - ret_type.dtype + # Append tuple, representing layout transfomr from src --> dst layout + op_order.append((src_layout, dst_layout)) + + random.shuffle(op_order) + return TestRandomRelayE2ECorrectness.create_relay_module(input_shape, dtype, op_order) + + @staticmethod + def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target + ) -> meta_schedule.ExtractedTask: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={"relay.backend.use_meta_schedule": True}, ) - return tvm.nd.array(numpy_init, device) - - def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - # By setting the tile_sizes to search to nothing, the layout transform rule just returns - # the original schedule. 
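
get_primfunc above depends on the context manager temporarily replacing the globally registered schedule-rule packed function and restoring it afterwards. A stripped-down sketch of that override-and-restore pattern, using a throwaway registry name invented for the example:

    import tvm

    NAME = "testing.layout_transform_demo"        # hypothetical name, not used by TVM itself
    tvm.register_func(NAME, lambda x: x + 1)      # original behaviour
    original = tvm.get_global_func(NAME)

    tvm.register_func(NAME, lambda x: x + 100, override=True)  # patch (__enter__)
    print(tvm.get_global_func(NAME)(1))           # 101

    tvm.register_func(NAME, original, override=True)           # restore (__exit__)
    print(tvm.get_global_func(NAME)(1))           # 2
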
- tile_size_input = [] if tile_size is None else [tile_size] - with PatchCustomLayoutTransformScheduleRule(tile_sizes=tile_size_input): - tune_context = meta_schedule.TuneContext( - mod=dispatched_mod, - target=target, - space_generator=space_generator, - search_strategy=meta_schedule.search_strategy.create(), + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + return task_of_interest + + @staticmethod + def run_primfunc( + primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] + ): + """Compile and run the primfunc with the given input tensors.""" + with tvm.transform.PassContext( + config={"relay.backend.use_meta_schedule": True}, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + @staticmethod + def get_primfunc(extracted_task: meta_schedule.ExtractedTask, tile_size: Optional[int]): + with PatchCustomLayoutTransformScheduleRule( + tile_sizes=[] if tile_size is None else [tile_size] + ): + with tempfile.TemporaryDirectory() as tmpdir: + ( + tune_contexts, + _, + ) = meta_schedule.relay_integration.extracted_tasks_to_tune_contexts( + [extracted_task], + tmpdir, + ) + tune_contexts[0].pre_tuning(1) + candidates = tune_contexts[0].generate_measure_candidates() + primfunc = candidates[0].sch.mod["main"] + return primfunc + + @staticmethod + def verify_layout_transform_task( + extracted_task: meta_schedule.ExtractedTask, + target: tvm.target.Target, + tile_sizes: List[int], + ): + """Given a layout transform task, tests the given tile_sizes and verifies output matches.""" + device = tvm.cuda(0) + relay_mod = extracted_task.mod + + # Create and cache inputs + func_type = relay.transform.InferTypeLocal(relay_mod[relay_mod.get_global_vars()[0]]) + input_tensors = [] + for input_type in func_type.arg_types: + orig_input_np = np.random.uniform(0, 10, size=list(map(int, input_type.shape))).astype( + input_type.dtype + ) + orig_input_np = np.arange(0, orig_input_np.size, dtype=input_type.dtype).reshape( + orig_input_np.shape + ) + input_tensors.append(tvm.nd.array(orig_input_np, device)) + ret_type = func_type.ret_type + + def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: + numpy_init = np.random.uniform(0, 1000, size=list(map(int, ret_type.shape))).astype( + ret_type.dtype + ) + return tvm.nd.array(numpy_init, device) + + def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( + extracted_task, tile_size ) - tune_context.pre_tuning(32) - returned_primfunc = tune_context.generate_measure_candidates()[0].sch.mod output_tensor = get_output_tensor() - run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) - # print(returned_primfunc) + TestRandomRelayE2ECorrectness.run_primfunc( + returned_primfunc, target, [*input_tensors, output_tensor] + ) return output_tensor.numpy() - # Passing None, we basically do not apply the custom rule we have created. 
- ground_truth_np = run_and_get_output(None) - for tile_size in tile_sizes: - experimental_result_np = run_and_get_output(tile_size) - - np.testing.assert_allclose(ground_truth_np, experimental_result_np) - - -def generate_all_test_case( - # Each has ~10k elements - input_shapes: List[List[int]] = [ - [12, 48, 18], - [890, 14], - [10, 12, 2, 5, 3, 3], - ], - implicit_reshape_conditions: List[Optional[Tuple[int, int]]] = [None, (0, 2), (1, 2)], - dtypes: List[str] = ["float32", "float16"], - num_additional_ops: int = 0, - tile_sizes: List[int] = [32, 20, 19], - repeats_per_condition=10, -): - # Small numbers which should work for nearly every (modern-ish) gpu. - target = tvm.target.Target( - "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + # Passing None, we basically do not apply the custom rule we have created + # and instead use the old default schedule which is the ground truth. + ground_truth_np = run_and_get_output(None) + + for tile_size in tile_sizes: + experimental_np = run_and_get_output(tile_size) + np.testing.assert_allclose(ground_truth_np, experimental_np) + + input_shape, implicit_reshape_info, dtype, tile_sizes = tvm.testing.parameters( + *itertools.product( + # InputShapes: Each has ~10k elements, should take single microseconds on modern gpu + [ + [12, 48, 18], + [890, 14], + [10, 12, 2, 5, 3, 3], + ], + # Implicit reshape conditions. + # None is do no implicit reshape, (0, 2) means divide axis 0 in half, e.g. AB --> A2aB + [None, (0, 2), (1, 2)], + # Dtypes to test, should not matter that much + ["float16"], + # Tile sizes to try + [[8, 7]], + ) ) - for _ in range(repeats_per_condition): - for input_shape, implicit_reshape_info, dtype in itertools.product( - input_shapes, implicit_reshape_conditions, dtypes - ): - # Generate random module of fusable ops + layout transform and extract fused layout transform task - full_mod = generate_test_case( - input_shape, implicit_reshape_info, dtype, num_additional_ops - ) - # Fused layout transform task - relay_mod, dispatched_mod = extract_layout_transform_task(full_mod, target) + @tvm.testing.requires_gpu + def test_all_test_case( + self, + input_shape, + implicit_reshape_info, + dtype, + tile_sizes, + # number of non-layout transform ops to include and may be fused + num_additional_ops: int = 5, + ): + """Tests the product of all conditions `repeat_per_condition` times.""" + # Small gpu parameters which should work for nearly every (modern-ish) gpu. 
+ target = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" + ) + + # Generate random module of fusable ops + layout transform and extract fused layout transform task + full_mod = self.generate_test_case( + input_shape, implicit_reshape_info, dtype, num_additional_ops + ) - print(relay_mod) - verify_layout_transform_task(relay_mod, dispatched_mod, target, tile_sizes) - print("Verified!") - print() + # Fused layout transform task + extracted_task = self.extract_layout_transform_task(full_mod, target) + print(full_mod) + print(extracted_task.task_name) + self.verify_layout_transform_task(extracted_task, target, tile_sizes) + print("Done!") + print() if __name__ == "__main__": - # mod = create_relay_module([12, 48, 18], "float32", [("ABC", "B2bAC"), 2]) - # extracted_tasks = meta_schedule.relay_integration.extract_tasks( - # mod, - # tvm.target.Target("cuda"), - # {}, - # pass_config={ - # "relay.backend.use_meta_schedule": True, - # "relay.FuseOps.max_depth": 30, - # "relay.backend.tir_converter": "default", - # }, - # ) - # task_of_interest = None - # for task in extracted_tasks: - # if "layout_transform" in task.task_name: - # task_of_interest = task - # break - # assert task_of_interest is not None - - # # # Fused layout transform task - # dispatched_mod = task_of_interest.dispatched[0] - # base_schedule = tvm.tir.Schedule(dispatched_mod) - # verify_schedule(base_schedule, [30, 20, 19]) - - # exit() - - generate_all_test_case() + tvm.testing.main() From 67c2db9aa00e6aa6a99b0b4b4bfee1aa37600ed2 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:23:49 -0700 Subject: [PATCH 23/40] more cleanup --- ...schedule_schedule_cuda_layout_transform.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 318af4326542..4e16fc9c16f7 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -273,22 +273,31 @@ def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: for tile_size in tile_sizes: experimental_np = run_and_get_output(tile_size) np.testing.assert_allclose(ground_truth_np, experimental_np) + assert False - input_shape, implicit_reshape_info, dtype, tile_sizes = tvm.testing.parameters( + ( + input_shape, + implicit_reshape_info, + dtype, + tile_sizes, + num_additional_ops, + ) = tvm.testing.parameters( *itertools.product( - # InputShapes: Each has ~10k elements, should take single microseconds on modern gpu + # input_shape: Each has ~10k elements, should take single microseconds on modern gpu [ [12, 48, 18], [890, 14], [10, 12, 2, 5, 3, 3], ], - # Implicit reshape conditions. + # implicit_reshape_info: Implicit reshape conditions. # None is do no implicit reshape, (0, 2) means divide axis 0 in half, e.g. 
AB --> A2aB [None, (0, 2), (1, 2)], - # Dtypes to test, should not matter that much + # dtype: dtypes to test, should not matter that much ["float16"], - # Tile sizes to try + # tile_sizes: Tile sizes to try [[8, 7]], + # num_additional_ops: number of non-layout transform ops to include and may be fused + [5], ) ) @@ -299,8 +308,7 @@ def test_all_test_case( implicit_reshape_info, dtype, tile_sizes, - # number of non-layout transform ops to include and may be fused - num_additional_ops: int = 5, + num_additional_ops, ): """Tests the product of all conditions `repeat_per_condition` times.""" # Small gpu parameters which should work for nearly every (modern-ish) gpu. @@ -315,11 +323,7 @@ def test_all_test_case( # Fused layout transform task extracted_task = self.extract_layout_transform_task(full_mod, target) - print(full_mod) - print(extracted_task.task_name) self.verify_layout_transform_task(extracted_task, target, tile_sizes) - print("Done!") - print() if __name__ == "__main__": From 6e1ea6bc0a47476e2b57f09583be96905009ca40 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 14:57:46 -0700 Subject: [PATCH 24/40] remove forced false --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 4e16fc9c16f7..4058a7644548 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -273,7 +273,6 @@ def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: for tile_size in tile_sizes: experimental_np = run_and_get_output(tile_size) np.testing.assert_allclose(ground_truth_np, experimental_np) - assert False ( input_shape, From 8eace3073583a8249b87ed6677680e6965104ab5 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Wed, 15 Mar 2023 15:27:20 -0700 Subject: [PATCH 25/40] use the proper dispatcher --- python/tvm/relay/op/strategy/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 6111ceed0f98..0c39d12f318d 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1403,7 +1403,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), - wrap_topi_schedule(topi.cuda.schedule_injective), + schedule_injective, name="layout_transform.cuda", ) return strategy From 8eb6f8b108f100664a37df477ff25f785e9e07b4 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 09:49:15 -0700 Subject: [PATCH 26/40] update test, make default schedule rule None --- python/tvm/relay/op/strategy/generic.py | 2 +- tests/python/unittest/test_meta_schedule_relay_integration.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index d53cd045383b..2883e5e1fb77 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -2070,7 +2070,7 @@ def layout_transform_strategy(attrs, inputs, out_type, target): return strategy -def wrap_compute_layout_transform(topi_compute, schedule_rule=""): +def wrap_compute_layout_transform(topi_compute, 
schedule_rule="None"): """Wrap layout transform compute""" def _compute_layout_transform(attrs, inputs, output_type): diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index f1d74348db17..ee148db94d0a 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -20,6 +20,7 @@ import numpy as np import pytest + import tvm import tvm.testing from tvm import IRModule @@ -420,6 +421,7 @@ def main( # type: ignore ax0, ax1, ax2, ax3, ax4 = T.axis.remap("SSSSS", [i0, i1, i2, i3, i4]) T.reads(placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3]) T.writes(T_layout_trans[ax0, ax1, ax2, ax3, ax4]) + T.block_attr({"dst_layout": "NCHW3c", "input_shape": [1, 3, 16, 16], "schedule_rule": "None", "src_layout": "NCHW"}) T_layout_trans[ax0, ax1, ax2, ax3, ax4] = T.if_then_else( ax0 < T.int64(1) and ax1 * T.int64(3) + ax4 < T.int64(3) and ax2 < T.int64(16) and ax3 < T.int64(16), # type: ignore placeholder[ax0, ax1 * T.int64(3) + ax4, ax2, ax3], @@ -440,6 +442,7 @@ def main(placeholder: T.Buffer((T.int64(1), T.int64(2), T.int64(16), T.int64(16) ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) T.reads(placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)]) # type: ignore T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) + T.block_attr({"dst_layout": "NCHW", "input_shape": [1, 2, 16, 16, 4], "schedule_rule": "None", "src_layout": "NCHW4c"}) T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < T.int64(1) and ax1 < T.int64(8) and ax2 < T.int64(16) and ax3 < T.int64(16), placeholder[ax0, ax1 // T.int64(4), ax2, ax3, ax1 % T.int64(4)], T.float32(0), dtype="float32") # type: ignore @tvm.script.ir_module From 5546079ce637a93aa0f14ffa688715fdd3d74525 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 10:49:52 -0700 Subject: [PATCH 27/40] linting --- .../schedule/cuda/layout_transform.py | 76 ++++++++++++------- .../tvm/meta_schedule/schedule/cuda/test.py | 25 ------ python/tvm/relay/op/strategy/cuda.py | 2 +- .../postproc/rewrite_cooperative_fetch.cc | 8 +- ...schedule_schedule_cuda_layout_transform.py | 17 +++++ 5 files changed, 71 insertions(+), 57 deletions(-) delete mode 100644 python/tvm/meta_schedule/schedule/cuda/test.py diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index aa2378443602..8cee6ea51162 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -1,9 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""layout_transform scheduling rule for cuda.""" + import math from collections import deque from typing import List, Optional, Sequence, Tuple, Union import tvm -from tvm import meta_schedule, topi +from tvm import meta_schedule from tvm.tir.schedule import BlockRV, ExprRV, LoopRV ## Tiling layout transforms: @@ -62,7 +80,7 @@ # T.grid(64_ar, 1_br, 8_cr, 1_dr, 32_dim0, 32_dim1) # # Which allows us to read a tile with our wanted properties. -# For writing we use the existing analysis infrastructure to generate the proper structure for writing. +# For writing we use the existing analysis infrastructure to generate the structure for writing. def tile_layout_transform( @@ -110,8 +128,8 @@ def tile_layout_transform( def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): """E.g. if loop has extant of 8 but we want 10, returns size 10 loop with padding.""" - l1, l2 = sch.split(loop, [None, requested_size]) - return sch.fuse(l1, l2) + left, right = sch.split(loop, [None, requested_size]) + return sch.fuse(left, right) def pad_dimension_to_factor_of_tile_size( loop: LoopRV, initial_size: int, tile_size: int = tile_size @@ -213,7 +231,7 @@ def factor_dim_in_order( return loops, cur_loop_extants def get_high_level_loop_structure( - block_read: BlockRV, input_shape: Sequence[int], src_layout: str, dst_layout: str + block_read: BlockRV, input_shape: List[int], src_layout: str, dst_layout: str ): """Runs the factorization described above.""" # index 0 ... rank - 1 will always correspond to original loops @@ -245,9 +263,11 @@ def get_high_level_loop_structure( # Same thing with dim1 # [:rank + 1], since we placed dim0_loop_tiled in the end which we want to keep loops, cur_loop_extants = factor_dim_in_order( - ( - src_layout.index(dst_layout[loop_index_dst]) - for loop_index_dst in range(rank - 1, -1, -1) + list( + ( + src_layout.index(dst_layout[loop_index_dst]) + for loop_index_dst in range(rank - 1, -1, -1) + ) ), loops, cur_loop_extants, @@ -307,14 +327,15 @@ def create_cached_read( orig_input_shape: Sequence[int], orig_src_layout: str, orig_dst_layout: str, -) -> Tuple[List[int], str, str]: +) -> Tuple[BlockRV, List[int], str, str]: """ Makes layout transform schedule applicable to implicit reshape case. - Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous - 4-element chunks. This axis is then moved to the end (NCHWc). This is guaranteed by the operator to be done without - additional padding. To handle this we just split the associating axis (prev. type checking ensures C is divisible by 4) - in src_layout found in block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. + Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C + axis into contiguous 4-element chunks. This axis is then moved to the end (NCHWc). This is + guaranteed by the operator to be done without additional padding. To handle this we just split + the associating axis (prev. type checking ensures C is divisible by 4)in src_layout found in + block_read. E.g. NCHW -> NCHW4c now becomes NC4cHW -> NCHW4c. Note: NCHW4c --> NCHW is not allowed, so the only numeric digits will be in dst. 
@@ -371,8 +392,8 @@ def create_cached_read( # Calculate final input shapes, each of these are a single element for unsplit dims # and tuples for split dims associated with the two new axis - input_shape: List[Union[int, Tuple]] = [i for i in orig_input_shape] - new_src_layout: List[Union[str, Tuple]] = [c for c in orig_src_layout] + input_shape: List[Union[int, Tuple]] = list(orig_input_shape) + new_src_layout: List[Union[str, Tuple]] = list(orig_src_layout) for src_layout_split_index, split_factor in split_dimensions: dimension_name = new_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) @@ -382,8 +403,8 @@ def create_cached_read( ) # Unpack any tuples introduced via appending - def unpack_list(target_list) -> list: - output = [] + def unpack_list(target_list) -> List: + output: List = [] for ele in target_list: if isinstance(ele, tuple): output.extend(ele) @@ -433,26 +454,27 @@ def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: ret: The new block inlined into it's consumers. """ + # Rules defined by DefaultCUDA schedule_rule set. + autoinline_rule = meta_schedule.schedule_rule.AutoInline( + into_producer=True, + into_consumer=False, + inline_const_tensor=True, + disallow_if_then_else=False, + require_injective=False, + require_ordered=False, + ) + fringe = deque(sch.get_consumers(start_block)) visited = set() while len(fringe) > 0: cur_block = fringe.popleft() if cur_block in visited: continue - else: - visited.add(cur_block) + visited.add(cur_block) consumer_blocks = sch.get_consumers(cur_block) fringe.extend(consumer_blocks) - autoinline_rule = meta_schedule.schedule_rule.AutoInline( - into_producer=True, - into_consumer=False, - inline_const_tensor=True, - disallow_if_then_else=False, - require_injective=False, - require_ordered=False, - ) sch = autoinline_rule.apply(sch, cur_block)[0] diff --git a/python/tvm/meta_schedule/schedule/cuda/test.py b/python/tvm/meta_schedule/schedule/cuda/test.py deleted file mode 100644 index 33d56361f491..000000000000 --- a/python/tvm/meta_schedule/schedule/cuda/test.py +++ /dev/null @@ -1,25 +0,0 @@ -# Read step -print("read") -for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): - for ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused in range(3): - for ax0_0_1_fused_0_ax0_0_1_fused_1_fused in range(3): - v0 = ( - ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ax0_0_1_fused_0_ax0_0_1_fused_1_fused - ) * (2) + ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused - print( - f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_1_0_ax0_1_1_fused_1_fused_0_ax0_1_0_ax0_1_1_fused_1_fused_1_fused} -- index: {v0}" - ) - - -# Write step -print("write") -for ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused in range(2): - for ax0_ax1_fused_0 in range(3): - for ax0_ax1_fused_1 in range(3): - v_ax0 = (ax0_ax1_fused_0 * 3 + ax0_ax1_fused_1) // (4) - v_ax1 = ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused * (3) + ( - ax0_ax1_fused_0 * (3) + ax0_ax1_fused_1 - ) % (4) - print( - f"block:{ax0_0_0_ax0_1_0_ax0_1_1_fused_0_fused} thread:{ax0_ax1_fused_1}-- index: {v_ax0}, {v_ax1}" - ) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 0c39d12f318d..65573321f76c 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -1399,7 +1399,7 @@ def dft_strategy_cuda(attrs, inputs, out_type, target): @layout_transform_strategy.register(["cuda", "gpu"]) -def 
layout_transform_strategy(attrs, inputs, out_type, target): +def layout_transform_strategy_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_layout_transform(topi.layout_transform, schedule_rule="layout_transform"), diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 1cf0d893d3e6..8cdb6a1e92a6 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -44,10 +44,10 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } catch (const std::exception& e) { // This can occur if in a schedule we manually bind threads in the middle of a schedule - // and then later modify the schedule. As the passed in schedule is after running the entire trace - // the bound loop may be moved around in the IRModule. - // TODO: apply trace one inst at a time so schedule state is always accurate to instruction - LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " << e.what(); + // and then later modify the schedule. As the passed in schedule is after running the entire + // trace the bound loop may be moved around in the IRModule. + LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " + << e.what(); return NullOpt; } } diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 4058a7644548..35dd454b9195 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + # Edge Cases: # 1. Fusion with ops # 2. 
Fusion with ops From a3729c9c24ab2b5cc346f3f77d9924664466bf63 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 11:20:21 -0700 Subject: [PATCH 28/40] fix mypy errors --- .../schedule/cuda/layout_transform.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 8cee6ea51162..2c9b87790dbb 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -18,7 +18,7 @@ import math from collections import deque -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Tuple, Union import tvm from tvm import meta_schedule @@ -212,11 +212,11 @@ def spin_out_factor( return loops, loop_extants, factor_needed def factor_dim_in_order( - indices: Sequence[int], + indices: List[int], loops: List[LoopRV], cur_loop_extants: List[int], work_needed_inner_loop: int = tile_size, - ) -> Tuple[List[LoopRV], Sequence[int]]: + ) -> Tuple[List[LoopRV], List[int]]: """Factors out the loops in the order of indices until we reach needed work. Adds new loop factors to the back in reverse order of access. Returns new list @@ -242,7 +242,7 @@ def get_high_level_loop_structure( # Factor dim0 tile size and fuse things together loops, cur_loop_extants = factor_dim_in_order( - range(rank - 1, -1, -1), + list(range(rank - 1, -1, -1)), loops, cur_loop_extants, work_needed_inner_loop=tile_size, @@ -324,7 +324,7 @@ def get_high_level_loop_structure( def create_cached_read( sch: tvm.tir.Schedule, block_write: BlockRV, - orig_input_shape: Sequence[int], + orig_input_shape: List[int], orig_src_layout: str, orig_dst_layout: str, ) -> Tuple[BlockRV, List[int], str, str]: @@ -395,10 +395,10 @@ def create_cached_read( input_shape: List[Union[int, Tuple]] = list(orig_input_shape) new_src_layout: List[Union[str, Tuple]] = list(orig_src_layout) for src_layout_split_index, split_factor in split_dimensions: - dimension_name = new_src_layout[src_layout_split_index] + dimension_name = orig_src_layout[src_layout_split_index] new_src_layout[src_layout_split_index] = (dimension_name, dimension_name.lower()) input_shape[src_layout_split_index] = ( - input_shape[src_layout_split_index] // split_factor, + orig_input_shape[src_layout_split_index] // split_factor, split_factor, ) @@ -412,18 +412,17 @@ def unpack_list(target_list) -> List: output.append(ele) return output - new_src_layout = unpack_list(new_src_layout) - new_src_layout = "".join(new_src_layout) - new_dst_layout = "".join(new_dst_layout) + new_src_layout_str = "".join(unpack_list(new_src_layout)) + new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - reindex_map = [new_src_layout.index(dim) for dim in new_dst_layout] + reindex_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, index_map=tvm.tir.IndexMap.from_func( lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], - ndim=len(new_src_layout), + ndim=len(new_src_layout_str), ), storage_scope="shared", ) @@ -431,9 +430,9 @@ def unpack_list(target_list) -> List: # While the above will have the shared memory buffer match the reshaped input tensor # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) - sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout)]) + sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) - return block_read, unpack_list(input_shape), new_src_layout, new_dst_layout + return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str def auto_inline_into(sch: tvm.tir.Schedule, start_block: BlockRV) -> BlockRV: @@ -559,7 +558,7 @@ def cuda_layout_transform_schedule_rule( # Try tile size 2,3...threads_per_warp as tile size of 1 has no coaslescing. if testing_tile_sizes is None: - tile_sizes = range(2, get_max_tile_size() + 1) + tile_sizes = list(range(2, get_max_tile_size() + 1)) else: tile_sizes = testing_tile_sizes From bd5707745a80faeaf3e99a9a7827ae0f2548f4e6 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 11:28:52 -0700 Subject: [PATCH 29/40] clean up --- .../schedule/cuda/layout_transform.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 2c9b87790dbb..7f0c8a1a7e82 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -124,6 +124,12 @@ def tile_layout_transform( tile_size: The tile size of read and writes. There will be tile_size threads per block, each of which reads up to tile_size elements. + + Returns + ------- + ret: + A tuple of the block that writes to global memory, and the block that reads from + global memory. """ def pad_dimension_to_at_least_number(loop: LoopRV, requested_size: int): @@ -329,7 +335,11 @@ def create_cached_read( orig_dst_layout: str, ) -> Tuple[BlockRV, List[int], str, str]: """ - Makes layout transform schedule applicable to implicit reshape case. + Creates the cached read block with expected structure. + + Loop extants should follow the input shape closely. E.g. if the input is [2, 6, 8], we + expect our loop structure to be T.grid(2, 6, 8). Possibly reshape to handle implicit reshapes, + in which case we will match the implicit reshape shape. Layout transform allows semantics like NCHW --> NCHW4c. Which involves splitting the original C axis into contiguous 4-element chunks. This axis is then moved to the end (NCHWc). This is @@ -363,8 +373,8 @@ def create_cached_read( Returns ------- ret: - A tuple of the new input shape of shared memory buffer, the new src_layout and - new dst_layout string. + A tuple of the cached read block, new input shape of shared memory buffer, + the new src_layout, and new dst_layout string. """ # Figure out split dimensions, entries are (loop index in src_layout, split amount) split_dimensions: List[Tuple[int, int]] = [] @@ -431,7 +441,6 @@ def unpack_list(target_list) -> List: # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) - return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str @@ -529,14 +538,15 @@ def cuda_layout_transform_schedule_rule( schedules = [] # Always include the default schedules which will be handled via AutoBind schedule rule + # Except during testing if not testing_tile_sizes: schedules.append(sch) + sch = sch.copy() # Inline consumers of the layout transform into the layout transform block. # Normally default for injective schedules but must manually be called in new schedule rule - # as we introduce a new block under the custom schedule rule which is not taken into account - # during search space generation. TODO: rectify this. + # for consumers of the layout transform. TODO(AndrewZhaoLuo): Figure out why this is the case. auto_inline_into(sch, block) # Setup up basic structure of schedule of creating read into shared mem, before applying tiling @@ -546,12 +556,11 @@ def cuda_layout_transform_schedule_rule( # ... # Read block will read from global memory coalesced at the start # Assume write to output global memory is coalesced in block_write - # block_read = sch.cache_read(block, 0, "shared") - - # Handle the case where there is an implicit reshape going on. + # + # This also handles the case where there is an implicit reshape going on. # e.g. NCHW -> NCHW4c which is equivalent to reshaping NCHW # to NCcHW and then applying the new layout where the extant of c is 4. - # Grab final input shape and src and dst layouts. + # Grab final input shape and src and dst layouts with possible implicit reshape. block_read, input_shape, src_layout, dst_layout = create_cached_read( sch, block, input_shape, src_layout, dst_layout ) From 7eb21ad1b7697f85729eea2e187ce234681a13f0 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:25:07 -0700 Subject: [PATCH 30/40] manual test cases --- .../schedule/cuda/layout_transform.py | 10 +- ...schedule_schedule_cuda_layout_transform.py | 160 +++++++++++++++++- 2 files changed, 157 insertions(+), 13 deletions(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index 7f0c8a1a7e82..c51c8ed60f77 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -426,21 +426,21 @@ def unpack_list(target_list) -> List: new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - reindex_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] + dst_to_src_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, index_map=tvm.tir.IndexMap.from_func( - lambda *loops: [loops[reindex_map[i]] for i, _ in enumerate(loops)], + lambda *loops: [loops[dst_to_src_map[i]] for i, _ in enumerate(loops)], ndim=len(new_src_layout_str), ), storage_scope="shared", ) - # While the above will have the shared memory buffer match the reshaped input tensor - # the loops still match those of the write/output loop/buffer. 
Match the src layout instead loops_read = sch.get_loops(block_read) - sch.reorder(*[loops_read[reindex_map[i]] for i, _ in enumerate(new_dst_layout_str)]) + sch.reorder( + *[loops_read[new_dst_layout_str.index(dst_dim_name)] for dst_dim_name in new_src_layout_str] + ) return block_read, unpack_list(input_shape), new_src_layout_str, new_dst_layout_str diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 35dd454b9195..881c959050b0 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -24,7 +24,6 @@ # 2. Correctness when running # 3. Autotuning ability - import itertools import random import tempfile @@ -38,8 +37,15 @@ from tvm import meta_schedule, relay from tvm.meta_schedule.schedule.cuda.layout_transform import cuda_layout_transform_schedule_rule from tvm.relay.op import OpPattern +from tvm.script import ir as I +from tvm.script import tir as T from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +# Small gpu parameters which should work for nearly every (modern-ish) gpu. +TARGET = tvm.target.Target( + "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" +) + class PatchCustomLayoutTransformScheduleRule: """Patch the custom layout transform schedule to test only specific tile sizes. @@ -327,19 +333,157 @@ def test_all_test_case( num_additional_ops, ): """Tests the product of all conditions `repeat_per_condition` times.""" - # Small gpu parameters which should work for nearly every (modern-ish) gpu. - target = tvm.target.Target( - "cuda -max_threads_per_block=32 -max_num_threads=128 -thread_warp_size=32 -max_shared_memory_per_block=8192 -registers_per_block=1024" - ) - # Generate random module of fusable ops + layout transform and extract fused layout transform task full_mod = self.generate_test_case( input_shape, implicit_reshape_info, dtype, num_additional_ops ) # Fused layout transform task - extracted_task = self.extract_layout_transform_task(full_mod, target) - self.verify_layout_transform_task(extracted_task, target, tile_sizes) + extracted_task = self.extract_layout_transform_task(full_mod, TARGET) + self.verify_layout_transform_task(extracted_task, TARGET, tile_sizes) + + +@tvm.testing.requires_gpu +class TestManualCases: + def assert_extracted_equals_expected( + self, relay_mod: tvm.IRModule, expected_mod: tvm.IRModule, tile_size: int + ): + extracted_task = TestRandomRelayE2ECorrectness.extract_layout_transform_task( + relay_mod, TARGET + ) + dispatched_mod = extracted_task.dispatched[0] + sch = tvm.tir.Schedule(dispatched_mod) + block = sch.get_block("T_layout_trans") + output_sch = cuda_layout_transform_schedule_rule(sch, block, [tile_size])[0] + assert output_sch.mod.script() == expected_mod.script() + + def ntest_simple_tiling(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16")): + 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16", scope="shared") + for ax0_ax2_ax1_0_ax3_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax1_1_fused_0_ax1_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + v1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + ax1_1_fused_0_ax1_1_fused_1_fused) + v2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4)) + v3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + T.reads(p0[v0, v1, v2, v3]) + T.writes(p0_shared[v0, v1, v2, v3]) + p0_shared[v0, v1, v2, v3] = p0[v0, v1, v2, v3] + for ax0_ax1_fused_0 in range(T.int64(16)): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused // T.int64(4)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) // T.int64(16)) + v_ax3 = T.axis.spatial(T.int64(32), ax0_ax2_ax1_0_ax3_0_fused % T.int64(4) // T.int64(2) * T.int64(16) + (ax0_ax1_fused_0 * T.int64(16) + ax0_ax1_fused_1) % T.int64(16)) + T.reads(p0_shared[v_ax0, v_ax3, v_ax1, v_ax2]) + T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3]) + T.block_attr({"dst_layout": "NHWC", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3] = T.if_then_else(v_ax0 < T.int64(1) and v_ax3 < T.int64(32) and v_ax1 < T.int64(32) and v_ax2 < T.int64(32), p0_shared[v_ax0, v_ax3, v_ax1, v_ax2], T.float16(0)) + + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) + + def test_simple_implicit_reshape(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # - an implicit reshape is done (see p0_shared) + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused 
// T.int64(8)) + v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) + T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) + T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + for ax0_ax1_ax2_fused_0 in range(T.int64(16)): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) + v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) + v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) + T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) + T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) + + def test_expected_fusion_post(self): + mod = TestRandomRelayE2ECorrectness.create_relay_module( + [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c"), OpPattern.BROADCAST] + ) + + # Main things to notice: + # - two blocks each with 16, 16 extents which write/read shared mem + # - coalesced accesses in inner loop of global memory buffer for both + # - an implicit reshape is done (see p0_shared) + # - an addition is inlined in the final block (p1 input) + # fmt: off + @I.ir_module + class ExpectedModule: + @T.prim_func + def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), p1: T.Buffer((), "float16"), T_add: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # with T.block("root"): + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): + for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): + for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): + with T.block("p0_shared"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused // T.int64(8)) + v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) + v_ax4 = 
T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) + T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) + T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + for ax0_ax1_ax2_fused_0 in range(T.int64(16)): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): + with T.block("T_layout_trans"): + v_ax0 = T.axis.spatial(T.int64(1), T.int64(0)) + v_ax1 = T.axis.spatial(T.int64(8), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused // T.int64(16)) + v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) + v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) + v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) + T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], p1[()]) + T.writes(T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) + T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) + T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + p1[()] + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) if __name__ == "__main__": From 3d092fe9a53150f269539bd951e4c95b5cd93b73 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:27:48 -0700 Subject: [PATCH 31/40] manual tests --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 881c959050b0..827b9fcaa420 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -357,7 +357,7 @@ def assert_extracted_equals_expected( output_sch = cuda_layout_transform_schedule_rule(sch, block, [tile_size])[0] assert output_sch.mod.script() == expected_mod.script() - def ntest_simple_tiling(self): + def test_simple_tiling(self): mod = TestRandomRelayE2ECorrectness.create_relay_module( [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] ) From 7d4df3f9ece6d310eaa5dd6d732079ff733a1ac7 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:39:40 -0700 Subject: [PATCH 32/40] add comment, fix improper implicit reshape handling --- python/tvm/meta_schedule/schedule/cuda/layout_transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py index c51c8ed60f77..949ef915c9ff 100644 --- a/python/tvm/meta_schedule/schedule/cuda/layout_transform.py +++ b/python/tvm/meta_schedule/schedule/cuda/layout_transform.py @@ -352,6 +352,8 @@ def create_cached_read( The returned layout strings will be santized and made compatible. E.g. NCHW --> NCHW4c becomes NCcHW --> NCHWc. + TODO(AndrewZhaoLuo): Investigate using proper memory alignment to avoid bank conflict. 
+ Parameters ---------- sch: @@ -426,7 +428,7 @@ def unpack_list(target_list) -> List: new_dst_layout_str = "".join(unpack_list(new_dst_layout)) # Write block loop extants match - dst_to_src_map = [new_src_layout_str.index(dim) for dim in new_dst_layout_str] + dst_to_src_map = [new_dst_layout_str.index(dim) for dim in new_src_layout_str] block_read = sch.reindex_cache_read( block_write, read_buffer_index=0, From 37a0e9d791557447de634972eac3642645fd2600 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:39:47 -0700 Subject: [PATCH 33/40] fix --- ...schedule_schedule_cuda_layout_transform.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 827b9fcaa420..7000fbab1d0e 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -414,7 +414,7 @@ class ExpectedModule: def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), T_layout_trans: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): T.func_attr({"global_symbol": "main", "tir.noalias": True}) # with T.block("root"): - p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared") for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): @@ -425,8 +425,8 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) - T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) - p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) + p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] for ax0_ax1_ax2_fused_0 in range(T.int64(16)): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): with T.block("T_layout_trans"): @@ -435,10 +435,10 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) - T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) + T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 
32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) - T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0)) self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_expected_fusion_post(self): @@ -458,7 +458,7 @@ class ExpectedModule: def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "float16"), p1: T.Buffer((), "float16"), T_add: T.Buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(32), T.int64(4)), "float16")): T.func_attr({"global_symbol": "main", "tir.noalias": True}) # with T.block("root"): - p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(32), T.int64(4), T.int64(32)), "float16", scope="shared") + p0_shared = T.alloc_buffer((T.int64(1), T.int64(8), T.int64(4), T.int64(32), T.int64(32)), "float16", scope="shared") for ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused in T.thread_binding(T.int64(128), thread="blockIdx.x"): for ax3_1_fused_0_ax3_1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.x"): for ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused in range(T.int64(16)): @@ -469,8 +469,8 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax3 = T.axis.spatial(T.int64(32), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(8) // T.int64(4) * T.int64(16) + ax3_1_fused_0_ax3_1_fused_1_fused) v_ax4 = T.axis.spatial(T.int64(4), ax2_1_ax3_0_1_ax4_1_fused_0_ax2_1_ax3_0_1_ax4_1_fused_1_fused % T.int64(4)) T.reads(p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3]) - T.writes(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2]) - p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] + T.writes(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3]) + p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3] = p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] for ax0_ax1_ax2_fused_0 in range(T.int64(16)): for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.x"): with T.block("T_layout_trans"): @@ -479,10 +479,10 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa v_ax2 = T.axis.spatial(T.int64(32), ax0_ax1_ax2_0_ax4_0_ax3_0_0_fused % T.int64(16) * T.int64(2) + (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) // T.int64(128)) v_ax3 = T.axis.spatial(T.int64(32), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(128) // T.int64(4)) v_ax4 = T.axis.spatial(T.int64(4), (ax0_ax1_ax2_fused_0 * T.int64(16) + ax0_ax1_ax2_fused_1) % T.int64(4)) - T.reads(p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], p1[()]) + T.reads(p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], p1[()]) T.writes(T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4]) T.block_attr({"dst_layout": "NCHW4c", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) - T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax3, v_ax4, v_ax2], T.float16(0)) + p1[()] + T_add[v_ax0, v_ax1, v_ax2, v_ax3, v_ax4] = 
T.if_then_else(v_ax0 < T.int64(1) and v_ax1 * T.int64(4) + v_ax4 < T.int64(32) and v_ax2 < T.int64(32) and v_ax3 < T.int64(32), p0_shared[v_ax0, v_ax1, v_ax4, v_ax2, v_ax3], T.float16(0)) + p1[()] self.assert_extracted_equals_expected(mod, ExpectedModule, 16) From 0687e572ced9f1c61161881bb2bb116b9c0b1ea7 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:40:19 -0700 Subject: [PATCH 34/40] remove extra comments --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 7000fbab1d0e..a0d76ef1745f 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -15,15 +15,6 @@ # specific language governing permissions and limitations # under the License. -# Edge Cases: -# 1. Fusion with ops -# 2. Fusion with ops - -# Properties to test for -# 1. Compiling -- compiles well without crashing -# 2. Correctness when running -# 3. Autotuning ability - import itertools import random import tempfile From 8945399c124a61ad8aa6388bfa74579324165d2a Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 12:51:24 -0700 Subject: [PATCH 35/40] more lints --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index a0d76ef1745f..3f5fba8e6296 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -21,7 +21,6 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np -import pytest import tvm import tvm.testing @@ -30,7 +29,7 @@ from tvm.relay.op import OpPattern from tvm.script import ir as I from tvm.script import tir as T -from tvm.tir.schedule import BlockRV, ExprRV, LoopRV +from tvm.tir.schedule import BlockRV # Small gpu parameters which should work for nearly every (modern-ish) gpu. TARGET = tvm.target.Target( From ceb75488da4eb6d2d65c8b306823ba9e1a1b2eac Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 16 Mar 2023 13:22:02 -0700 Subject: [PATCH 36/40] refactor --- ...schedule_schedule_cuda_layout_transform.py | 230 ++++++++---------- 1 file changed, 107 insertions(+), 123 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 3f5fba8e6296..ec93103f8209 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -66,91 +66,116 @@ def __exit__(self, *args, **kwargs) -> None: tvm.register_func(self.FUNC_NAME, self.old_func, override=True) -class TestRandomRelayE2ECorrectness: - """Tests E2E correctness of layout transform schedule. +# Create unary functions which apply ops with compatible fusion levels to layout transform +def get_random_axis(data: relay.Expr): + rank = len(relay.transform.InferTypeLocal(data).shape) + return random.randint(0, rank - 1) - Randomly generates relay mod with layout transform and fusable ops. 
Checks the - layout transform task for correctness by comparing against its unscheduled result. - """ - # Create unary functions which apply ops with compatible fusion levels to layout transform - @staticmethod - def get_random_axis(data: relay.Expr): - rank = len(relay.transform.InferTypeLocal(data).shape) - return random.randint(0, rank - 1) +def apply_elemwise_clip(data: relay.Expr, min=0, max=10): + assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE + return relay.clip(data, min, max) - @staticmethod - def apply_elemwise_clip(data: relay.Expr, min=0, max=10): - assert relay.op.get("clip").get_attr("TOpPattern") == OpPattern.ELEMWISE - return relay.clip(data, min, max) - @staticmethod - def apply_broadcast_add(data: relay.Expr, val_to_add=5): - assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST - type_info = relay.transform.InferTypeLocal(data) - return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) +def apply_broadcast_add(data: relay.Expr, val_to_add=5): + assert relay.op.get("add").get_attr("TOpPattern") == OpPattern.BROADCAST + type_info = relay.transform.InferTypeLocal(data) + return relay.add(data, relay.const(val_to_add, dtype=type_info.dtype)) - @staticmethod - def apply_injective_concatenate(data: relay.Expr, axis=None): - if axis is None: - axis = TestRandomRelayE2ECorrectness.get_random_axis(data) - assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.concatenate([data, data], axis) - @staticmethod - def apply_comm_reduce_max(data: relay.Expr, axis=None): - if axis is None: - axis = TestRandomRelayE2ECorrectness.get_random_axis(data) - assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE +def apply_injective_concatenate(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("concatenate").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.concatenate([data, data], axis) - # Do this to maintain dimensions - return relay.add(data, relay.max(data, axis, keepdims=True)) - @staticmethod - def get_map_pattern_level_to_op() -> Dict[OpPattern, Callable]: - # These are the only levels of op which can possibly be fused with layout_transform (which injective) - return { - OpPattern.ELEMWISE: TestRandomRelayE2ECorrectness.apply_elemwise_clip, - OpPattern.BROADCAST: TestRandomRelayE2ECorrectness.apply_broadcast_add, - OpPattern.INJECTIVE: TestRandomRelayE2ECorrectness.apply_injective_concatenate, - OpPattern.COMM_REDUCE: TestRandomRelayE2ECorrectness.apply_comm_reduce_max, - } +def apply_comm_reduce_max(data: relay.Expr, axis=None): + if axis is None: + axis = get_random_axis(data) + assert relay.op.get("max").get_attr("TOpPattern") == OpPattern.COMM_REDUCE - @staticmethod - def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): - assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE - return relay.layout_transform(data, src_layout, dst_layout) + # Do this to maintain dimensions + return relay.add(data, relay.max(data, axis, keepdims=True)) - @staticmethod - def create_relay_module( - input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] - ) -> tvm.IRModule: - """Create a relay module with the given string of ops. - - ops: - Applies the associated operators in order. If an integer, refers to applying - the unary operator from `extra_pattern_level_to_op` map. 
If a tuple, applies - a layout transform with the given (src_layout, dst_layout) - """ - input_data = relay.var("input", shape=input_shape, dtype=dtype) - - cur_data = input_data - for op_info in ops: - # Progressively build type info - relay.transform.InferTypeLocal(cur_data) - if isinstance(op_info, tuple): - # layout transform case - src_layout, dst_layout = op_info - cur_data = TestRandomRelayE2ECorrectness.apply_layout_transform( - cur_data, src_layout, dst_layout - ) - else: - cur_data = TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op()[op_info]( - cur_data - ) +pattern_level_to_op = { + OpPattern.ELEMWISE: apply_elemwise_clip, + OpPattern.BROADCAST: apply_broadcast_add, + OpPattern.INJECTIVE: apply_injective_concatenate, + OpPattern.COMM_REDUCE: apply_comm_reduce_max, +} + + +def apply_layout_transform(data: relay.Expr, src_layout: str, dst_layout: str): + assert relay.op.get("layout_transform").get_attr("TOpPattern") == OpPattern.INJECTIVE + return relay.layout_transform(data, src_layout, dst_layout) + + +def create_relay_module( + input_shape: List[int], dtype: str, ops: List[Union[OpPattern, Tuple[str, str]]] +) -> tvm.IRModule: + """Create a relay module with the given string of ops. + + ops: + Applies the associated operators in order. If an integer, refers to applying + the unary operator from `extra_pattern_level_to_op` map. If a tuple, applies + a layout transform with the given (src_layout, dst_layout) + """ + input_data = relay.var("input", shape=input_shape, dtype=dtype) + + cur_data = input_data + for op_info in ops: + # Progressively build type info relay.transform.InferTypeLocal(cur_data) - return tvm.IRModule.from_expr(cur_data) + if isinstance(op_info, tuple): + # layout transform case + src_layout, dst_layout = op_info + cur_data = apply_layout_transform(cur_data, src_layout, dst_layout) + else: + cur_data = pattern_level_to_op[op_info](cur_data) + + relay.transform.InferTypeLocal(cur_data) + return tvm.IRModule.from_expr(cur_data) + + +def extract_layout_transform_task( + mod: tvm.IRModule, target: tvm.target.Target +) -> meta_schedule.ExtractedTask: + """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" + extracted_tasks = meta_schedule.relay_integration.extract_tasks( + mod, + target, + {}, + pass_config={"relay.backend.use_meta_schedule": True}, + ) + task_of_interest = None + for task in extracted_tasks: + if "layout_transform" in task.task_name: + task_of_interest = task + break + assert task_of_interest is not None + return task_of_interest + + +def run_primfunc( + primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] +): + """Compile and run the primfunc with the given input tensors.""" + with tvm.transform.PassContext( + config={"relay.backend.use_meta_schedule": True}, + opt_level=3, + ): + lib = tvm.build(primfunc_mod, target=target) + lib(*input_tensors) + + +class TestRandomRelayE2ECorrectness: + """Tests E2E correctness of layout transform schedule. + + Randomly generates relay mod with layout transform and fusable ops. Checks the + layout transform task for correctness by comparing against its unscheduled result. 
+ """ @staticmethod def generate_test_case( @@ -181,7 +206,7 @@ def generate_test_case( # Randomly sample a list of potentially fusable ops to layout transform op_order = random.choices( - list(TestRandomRelayE2ECorrectness.get_map_pattern_level_to_op().keys()), + list(pattern_level_to_op.keys()), k=num_additional_ops, ) @@ -189,38 +214,7 @@ def generate_test_case( op_order.append((src_layout, dst_layout)) random.shuffle(op_order) - return TestRandomRelayE2ECorrectness.create_relay_module(input_shape, dtype, op_order) - - @staticmethod - def extract_layout_transform_task( - mod: tvm.IRModule, target: tvm.target.Target - ) -> meta_schedule.ExtractedTask: - """Given a relay IRModule, return the PrimFunc IRModule with fused layout transform task.""" - extracted_tasks = meta_schedule.relay_integration.extract_tasks( - mod, - target, - {}, - pass_config={"relay.backend.use_meta_schedule": True}, - ) - task_of_interest = None - for task in extracted_tasks: - if "layout_transform" in task.task_name: - task_of_interest = task - break - assert task_of_interest is not None - return task_of_interest - - @staticmethod - def run_primfunc( - primfunc_mod: tvm.IRModule, target: tvm.target.Target, input_tensors: List[tvm.nd.NDArray] - ): - """Compile and run the primfunc with the given input tensors.""" - with tvm.transform.PassContext( - config={"relay.backend.use_meta_schedule": True}, - opt_level=3, - ): - lib = tvm.build(primfunc_mod, target=target) - lib(*input_tensors) + return create_relay_module(input_shape, dtype, op_order) @staticmethod def get_primfunc(extracted_task: meta_schedule.ExtractedTask, tile_size: Optional[int]): @@ -270,13 +264,9 @@ def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: return tvm.nd.array(numpy_init, device) def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( - extracted_task, tile_size - ) + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc(extracted_task, tile_size) output_tensor = get_output_tensor() - TestRandomRelayE2ECorrectness.run_primfunc( - returned_primfunc, target, [*input_tensors, output_tensor] - ) + run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) return output_tensor.numpy() # Passing None, we basically do not apply the custom rule we have created @@ -329,7 +319,7 @@ def test_all_test_case( ) # Fused layout transform task - extracted_task = self.extract_layout_transform_task(full_mod, TARGET) + extracted_task = extract_layout_transform_task(full_mod, TARGET) self.verify_layout_transform_task(extracted_task, TARGET, tile_sizes) @@ -338,9 +328,7 @@ class TestManualCases: def assert_extracted_equals_expected( self, relay_mod: tvm.IRModule, expected_mod: tvm.IRModule, tile_size: int ): - extracted_task = TestRandomRelayE2ECorrectness.extract_layout_transform_task( - relay_mod, TARGET - ) + extracted_task = extract_layout_transform_task(relay_mod, TARGET) dispatched_mod = extracted_task.dispatched[0] sch = tvm.tir.Schedule(dispatched_mod) block = sch.get_block("T_layout_trans") @@ -348,9 +336,7 @@ def assert_extracted_equals_expected( assert output_sch.mod.script() == expected_mod.script() def test_simple_tiling(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( - [1, 32, 32, 32], "float16", [("NCHW", "NHWC")] - ) + mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NHWC")]) # Main things to notice: # - two blocks each with 16, 16 extents which write/read shared mem @@ -389,9 +375,7 @@ def main(p0: 
T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_simple_implicit_reshape(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( - [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")] - ) + mod = create_relay_module([1, 32, 32, 32], "float16", [("NCHW", "NCHW4c")]) # Main things to notice: # - two blocks each with 16, 16 extents which write/read shared mem @@ -432,7 +416,7 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_expected_fusion_post(self): - mod = TestRandomRelayE2ECorrectness.create_relay_module( + mod = create_relay_module( [1, 32, 32, 32], "float16", [("NCHW", "NCHW4c"), OpPattern.BROADCAST] ) From 7fdcb690b94634112a9681a3a09b6be0293c3d9c Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Mon, 20 Mar 2023 10:56:29 -0700 Subject: [PATCH 37/40] remove extraneous check --- .../postproc/rewrite_cooperative_fetch.cc | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 8cdb6a1e92a6..19682e6c614a 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -40,16 +40,7 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins return NullOpt; } - try { - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); - } catch (const std::exception& e) { - // This can occur if in a schedule we manually bind threads in the middle of a schedule - // and then later modify the schedule. As the passed in schedule is after running the entire - // trace the bound loop may be moved around in the IRModule. - LOG(DEBUG) << "Failed to calculate extent so skipping RewriteCooperativeFetching. Error " - << e.what(); - return NullOpt; - } + return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } /*! 
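The implicit-reshape handling that patches 30 through 33 keep adjusting (create_cached_read and dst_to_src_map) comes down to the semantics spelled out in the create_cached_read docstring: NCHW -> NCHW4c splits C into contiguous 4-element chunks and moves the chunk axis to the end. A minimal NumPy sketch of that equivalence, assuming C divides evenly by the chunk size so no padding is introduced; the helper and variable names here are illustrative only and are not part of any patch above.

import numpy as np

def nchw_to_nchw4c(x: np.ndarray, chunk: int = 4) -> np.ndarray:
    # Split C into (C // chunk, chunk): NCHW viewed as the "NC4cHW" form from the
    # create_cached_read docstring, then move the inner chunk axis to the end (NCHWc).
    n, c, h, w = x.shape
    assert c % chunk == 0, "the operator guarantees no padding, so C must divide evenly"
    return x.reshape(n, c // chunk, chunk, h, w).transpose(0, 1, 3, 4, 2)

x = np.arange(1 * 8 * 2 * 2, dtype="float16").reshape(1, 8, 2, 2)
y = nchw_to_nchw4c(x)
assert y.shape == (1, 2, 2, 2, 4)
# y[n, c_outer, h, w, c_inner] == x[n, c_outer * 4 + c_inner, h, w], matching the
# p0[v_ax0, v_ax1 * T.int64(4) + v_ax4, v_ax2, v_ax3] reads in the expected modules above.
assert y[0, 1, 0, 1, 3] == x[0, 1 * 4 + 3, 0, 1]
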
From a5d8f5f50161e27af83031c48e020f3aee0d8a08 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 21 Mar 2023 15:25:46 -0700 Subject: [PATCH 38/40] lint again :/ --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index ec93103f8209..06b6514ee3aa 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -264,7 +264,9 @@ def get_output_tensor() -> Tuple[tvm.nd.NDArray, tvm.nd.NDArray]: return tvm.nd.array(numpy_init, device) def run_and_get_output(tile_size: Optional[int]) -> np.ndarray: - returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc(extracted_task, tile_size) + returned_primfunc = TestRandomRelayE2ECorrectness.get_primfunc( + extracted_task, tile_size + ) output_tensor = get_output_tensor() run_primfunc(returned_primfunc, target, [*input_tensors, output_tensor]) return output_tensor.numpy() From bf60774dfffb42ba0bac1695c4a26f19e7574c8d Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Tue, 21 Mar 2023 15:29:07 -0700 Subject: [PATCH 39/40] remove uneeded newline --- src/meta_schedule/postproc/rewrite_cooperative_fetch.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 19682e6c614a..353b90c36423 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -39,7 +39,6 @@ Optional ParseThreadBinding(const Schedule& sch, const Instruction& ins if (thread_axis != axis) { return NullOpt; } - return Downcast(sch->Get(Downcast(inst->inputs[0]))->extent); } From 2c1047a05f50421dc36888cfed3405bec03f4883 Mon Sep 17 00:00:00 2001 From: Andrew Zhao Luo Date: Thu, 23 Mar 2023 09:54:59 -0700 Subject: [PATCH 40/40] remove leading spaces --- .../test_meta_schedule_schedule_cuda_layout_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py index 06b6514ee3aa..d1ba84d836be 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py +++ b/tests/python/unittest/test_meta_schedule_schedule_cuda_layout_transform.py @@ -373,7 +373,7 @@ def main(p0: T.Buffer((T.int64(1), T.int64(32), T.int64(32), T.int64(32)), "floa T.writes(T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3]) T.block_attr({"dst_layout": "NHWC", "input_shape": [1, 32, 32, 32], "schedule_rule": "layout_transform", "src_layout": "NCHW"}) T_layout_trans[v_ax0, v_ax1, v_ax2, v_ax3] = T.if_then_else(v_ax0 < T.int64(1) and v_ax3 < T.int64(32) and v_ax1 < T.int64(32) and v_ax2 < T.int64(32), p0_shared[v_ax0, v_ax3, v_ax1, v_ax2], T.float16(0)) - + self.assert_extracted_equals_expected(mod, ExpectedModule, 16) def test_simple_implicit_reshape(self):
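
Taken together, the schedule rule in this series stages a tile_size x tile_size tile of the input through shared memory so that both the global-memory read and the global-memory write iterate contiguously in their innermost loop. Below is a self-contained NumPy sketch of that access pattern for a plain 32x32 transpose with 16x16 tiles; it is an emulation for illustration only, not the generated CUDA kernel, and the function name is made up for this sketch.

import numpy as np

def tiled_transpose(src: np.ndarray, tile_size: int = 16) -> np.ndarray:
    n, m = src.shape
    dst = np.empty((m, n), dtype=src.dtype)
    # Each (bi, bj) pair plays the role of one threadblock.
    for bi in range(0, n, tile_size):
        for bj in range(0, m, tile_size):
            # Read step: each row segment of src is copied with the fastest-varying
            # index walking the last axis, the analog of coalesced loads into shared.
            shared = src[bi:bi + tile_size, bj:bj + tile_size].copy()
            # Write step: rows of dst are written contiguously as well; the transpose
            # happens on the staging buffer, never directly on global memory.
            dst[bj:bj + tile_size, bi:bi + tile_size] = shared.T
    return dst

x = np.arange(32 * 32, dtype="float16").reshape(32, 32)
np.testing.assert_array_equal(tiled_transpose(x), x.T)

The actual rule additionally factors the loop nest so that split dimensions (the implicit-reshape case) and extents smaller than tile_size are handled, and it leaves the default injective schedule in the tuning space so AutoBind and the RewriteCooperativeFetch postprocessor touched above can still handle thread binding for that candidate.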