microsoft · LeiWang1999 · Nov 1, 2024 · Oct 4, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "3rdparty/tvm"]
 	path = 3rdparty/tvm
 	url = https://github.com/TileLang/tvm.git
-	branch = tilelang
+	branch = upstream
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
 	url = https://github.com/TileLang/cutlass

diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/bitblas/ops/base_scheduler.py b/bitblas/ops/base_scheduler.py
@@ -70,3 +70,13 @@ def common_header(self):
         # TODO(lei): For HIP Backend it should be different
         common_header = "#include <tl_templates/cuda/common.h>\n"
         return common_header
+
+
+def simplify_prim_func(func: Callable):
+    """Decorator: pass the value returned by ``func`` through ``BaseScheduler.Simplify``."""
+    import functools  # function-scope import so this addition needs no new module-level import
+    @functools.wraps(func)  # keep func's __name__/__doc__ visible on the returned wrapper
+    def wrapper(*args, **kwargs):
+        stmt: Union[PrimFunc, IRModule] = func(*args, **kwargs)
+        return BaseScheduler.Simplify(stmt)
+    return wrapper
diff --git a/bitblas/ops/general_matmul/tilelang/dense/matmul_tensorcore.py b/bitblas/ops/general_matmul/tilelang/dense/matmul_tensorcore.py
@@ -424,7 +424,9 @@ def apply_config(
         threads = warp_size * (block_row_warps * block_col_warps)
 
         # Calculate local fragment sizes for tensor core
-        local_size = (micro_size_x * micro_size_y) // warp_size
+        local_size_a = (micro_size_x * micro_size_k) // warp_size
+        local_size_b = (micro_size_y * micro_size_k) // warp_size
+        local_size_c = (micro_size_x * micro_size_y) // warp_size
         warp_rows = warp_row_tiles // micro_size_x
         warp_cols = warp_col_tiles // micro_size_y
 
@@ -459,9 +461,9 @@ def main(
                 A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope)
                 B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope)
                 C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope)
-                A_local = T.alloc_local((warp_rows * local_size), in_dtype)
-                B_local = T.alloc_local((warp_cols * local_size), in_dtype)
-                C_local = T.alloc_local((warp_rows * warp_cols * local_size), accum_dtype)
+                A_local = T.alloc_local((warp_rows * local_size_a), in_dtype)
+                B_local = T.alloc_local((warp_cols * local_size_b), in_dtype)
+                C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype)
 
                 # Thread-level parallelism for Tensor Cores
                 thread_bindings = T.thread_binding(0, threads, "threadIdx.x")

diff --git a/bitblas/ops/general_matmul/tilelang/dequantize/finegrained_primitive_tensorcore.py b/bitblas/ops/general_matmul/tilelang/dequantize/finegrained_primitive_tensorcore.py
@@ -231,7 +231,9 @@ def apply_config(
         block_K = chunk
         threads = warp_size * (block_row_warps * block_col_warps)
 
-        fragement_size = (micro_size_x * micro_size_y) // warp_size
+        fragement_size_a = (micro_size_x * micro_size_k) // warp_size
+        fragement_size_b = (micro_size_y * micro_size_k) // warp_size
+        fragement_size_c = (micro_size_x * micro_size_y) // warp_size
         warp_rows = warp_row_tiles // micro_size_x
         warp_cols = warp_col_tiles // micro_size_y
 
@@ -318,9 +320,9 @@ def general_dequant_matmul(
                 B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype)
                 C_shared = T.alloc_shared(C_shared_shape, out_dtype)
 
-                A_frag = T.alloc_local((warp_rows * fragement_size), in_dtype)
-                B_frag = T.alloc_local((warp_cols * fragement_size), in_dtype)
-                C_frag = T.alloc_local((warp_rows * warp_cols * fragement_size), accum_dtype)
+                A_frag = T.alloc_local((warp_rows * fragement_size_a), in_dtype)
+                B_frag = T.alloc_local((warp_cols * fragement_size_b), in_dtype)
+                C_frag = T.alloc_local((warp_rows * warp_cols * fragement_size_c), accum_dtype)
 
                 B_local = T.alloc_local([local_size_compressed], storage_dtype)
                 B_dequantize_local = T.alloc_local([local_size], in_dtype)

diff --git a/bitblas/ops/general_matmul/tilelang/dequantize/ladder_weight_transform_tensorcore.py b/bitblas/ops/general_matmul/tilelang/dequantize/ladder_weight_transform_tensorcore.py
@@ -71,7 +71,9 @@ def apply_config(
         block_K = chunk
         threads = warp_size * (block_row_warps * block_col_warps)
 
-        fragement_size = (micro_size_x * micro_size_y) // warp_size
+        fragement_size_a = (micro_size_x * micro_size_k) // warp_size
+        fragement_size_b = (micro_size_y * micro_size_k) // warp_size
+        fragement_size_c = (micro_size_x * micro_size_y) // warp_size
         warp_rows = warp_row_tiles // micro_size_x
         warp_cols = warp_col_tiles // micro_size_y
 
@@ -173,11 +175,11 @@ def general_dequant_matmul(
                 B_shared = T.alloc_shared(B_shared_shape, storage_dtype)
                 C_shared = T.alloc_shared(C_shared_shape, out_dtype)
 
-                A_frag = T.alloc_local((warp_rows * fragement_size), in_dtype)
-                B_frag = T.alloc_local((warp_cols * fragement_size // num_elems_per_byte),
+                A_frag = T.alloc_local((warp_rows * fragement_size_a), in_dtype)
+                B_frag = T.alloc_local((warp_cols * fragement_size_b // num_elems_per_byte),
                                        storage_dtype)
-                B_dequantize_frag = T.alloc_local((warp_cols * fragement_size), in_dtype)
-                C_frag = T.alloc_local((warp_rows * warp_cols * fragement_size), accum_dtype)
+                B_dequantize_frag = T.alloc_local((warp_cols * fragement_size_b), in_dtype)
+                C_frag = T.alloc_local((warp_rows * warp_cols * fragement_size_c), accum_dtype)
 
                 tx = T.thread_binding(0, threads, thread="threadIdx.x")