[Dev][AMD] Implement conditional async load for AMD HIP Backend (#250)
* test fix

* submodule update

* lint fix

* update submodule
LeiWang1999 authored Nov 28, 2024
1 parent 6895496 commit 645ccd7
Showing 2 changed files with 23 additions and 9 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from fba6ef to 1cc769
30 changes: 22 additions & 8 deletions benchmark/tilelang/benchmark_tilelang_matmul.py
@@ -4,6 +4,12 @@
 from tvm.tl.autotuner import *
 import itertools
 
+import logging
+
+logger = logging.getLogger(__name__)
+
+logger.setLevel(logging.DEBUG)
+
 
 def ref_program(A, B):
     return A @ B.T
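
A note on the new logging lines: logger.setLevel(logging.DEBUG) on its own only lowers this module logger's threshold; if no handler is configured, Python's last-resort handler still suppresses records below WARNING. A minimal sketch of what a caller would add to actually see DEBUG output (logging.basicConfig here is an assumption, not part of this commit):

    import logging
    # Attach a root handler so DEBUG-level records are actually emitted to stderr.
    logging.basicConfig(level=logging.DEBUG)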
@@ -16,16 +22,20 @@ def get_configs():
     num_stages = [0, 1, 2, 3, 4]
     thread_num = [128, 256]
     enable_rasteration = [True, False]
+    k_pack = [1, 2]
 
     _configs = list(
-        itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasteration))
+        itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasteration,
+                          k_pack))
 
     configs = [{
         'block_M': c[0],
         'block_N': c[1],
         'block_K': c[2],
         'num_stages': c[3],
         'thread_num': c[4],
-        'enable_rasteration': c[5]
+        'enable_rasteration': c[5],
+        'k_pack': c[6]
     } for c in _configs]
     return configs
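
With the new k_pack axis, every dictionary yielded by get_configs() now carries seven keys. A representative entry is sketched below; the block_M/block_N/block_K values are illustrative assumptions, since those lists sit above this hunk, while the key set matches the diff:

    # One illustrative config dict produced after this change.
    {'block_M': 128, 'block_N': 128, 'block_K': 32,
     'num_stages': 2, 'thread_num': 256,
     'enable_rasteration': True, 'k_pack': 2}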

@@ -34,22 +44,26 @@ def matmul(M, N, K):
 
     @autotune(
         configs=get_configs(),
-        keys=['block_M', 'block_N', 'block_K', 'num_stages', 'thread_num'],
+        keys=[
+            'block_M', 'block_N', 'block_K', 'num_stages', 'thread_num', 'enable_rasteration',
+            'k_pack'
+        ],
         warmup=3,
         rep=5)
     @jit(
         out_idx=[2],
         supply_type=tl.TensorSupplyType.Integer,
         ref_prog=ref_program,
-        skip_check=True,
+        skip_check=False,
         profiler="tvm",
         target="hip")
     def kernel(block_M=None,
                block_N=None,
                block_K=None,
                num_stages=None,
                thread_num=None,
-               enable_rasteration=None):
+               enable_rasteration=None,
+               k_pack=None):
         dtype = "float16"
         accum_dtype = "float"
@@ -66,9 +80,9 @@ def main(A: T.Buffer((M, K), dtype), B: T.Buffer((N, K), dtype), C: T.Buffer((M,
 
             T.clear(C_local)
             for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
-                T.copy(A[by * block_M, k * block_K], A_shared)
-                T.copy(B[bx * block_N, k * block_K], B_shared)
-                T.gemm(A_shared, B_shared, C_local, transpose_B=True)
+                T.copy(A[by * block_M, k * block_K], A_shared, coalesced_width=4 * k_pack)
+                T.copy(B[bx * block_N, k * block_K], B_shared, coalesced_width=4 * k_pack)
+                T.gemm(A_shared, B_shared, C_local, transpose_B=True, k_pack=k_pack)
                 T.copy(C_local, C[by * block_M, bx * block_N])
 
     return main
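
As this diff reads, k_pack both widens the vectorized shared-memory copies (coalesced_width=4 * k_pack) and is forwarded to T.gemm, where on the HIP backend it plausibly controls how many k-slices are packed per MFMA issue; that interpretation is inferred from the parameter names, not stated in the commit. A minimal sketch of driving the autotuned benchmark, assuming the shapes and the returned triple follow the usual tvm.tl autotuner convention (both are assumptions, not shown in this diff):

    # Hypothetical driver; matmul() is assumed to run the autotuner and return
    # (best_latency, best_config, ref_latency) as in other tvm.tl benchmarks.
    M, N, K = 8192, 8192, 8192
    best_latency, best_config, ref_latency = matmul(M, N, K)
    print(f"Best latency: {best_latency}")
    print(f"Best config:  {best_config}")   # now includes 'k_pack'
    print(f"Ref latency:  {ref_latency}")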