[Dev][TL] Implement Tile Language Dequant Matmul and Test Case (#224)

* Refactor dequantize scheduler and simplify pass * Refactor dequantize scheduler and simplify pass * Implement Fast Decoding
microsoft · Oct 21, 2024 · 7ed01bf · 7ed01bf
1 parent 002e9a6
commit 7ed01bf
Show file tree

Hide file tree

Showing 7 changed files with 131 additions and 24 deletions.
diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/bitblas/gpu/intrin/lop3.py b/bitblas/gpu/intrin/lop3.py
@@ -1685,6 +1685,12 @@ def get_lop3_intrin_group(
         raise ValueError("Unsupported target dtype: {}".format(target_dtype))
     source_symbol = "u" if source_format == "uint" else "s"
     func_name = "decode_i{}{}_to_{}".format(source_bit, source_symbol, d4f)
+    if with_scaling:
+        func_name += "_scale"
+    if with_zeros:
+        func_name += f"_zeros_{zeros_mode}"
+    if is_ladder_stage3:
+        func_name += "_offset"
 
     return {
         "func_name": func_name,

diff --git a/bitblas/ops/base_scheduler.py b/bitblas/ops/base_scheduler.py
@@ -64,3 +64,9 @@ def apply_config(
         **kwargs,
     ):
         pass
+
+    @property
+    def common_header(self):
+        # TODO(lei): For HIP Backend it should be different
+        common_header = "#include <tl_templates/cuda/common.h>\n"
+        return common_header
diff --git a/bitblas/ops/general_matmul/tilelang/dequantize/block_primitive_tensorcore.py b/bitblas/ops/general_matmul/tilelang/dequantize/block_primitive_tensorcore.py
@@ -234,14 +234,17 @@ def apply_config(
         if fast_decoding is True:
             lop3_intrin_info = get_lop3_intrin_group(
                 out_dtype=out_dtype,
-                storage_dtype=storage_dtype,
                 source_format=source_format,
                 source_bit=num_bits,
+                storage_dtype=storage_dtype,
+                with_scaling=self.with_scaling,
+                with_zeros=self.with_zeros,
             )
             import_source = lop3_intrin_info["c_source"]
             func_name = lop3_intrin_info["func_name"]
             assert import_source is not None, "lop3_intrin_info is not found"
             assert func_name is not None, "lop3_intrin_info is not found"
+            import_source = self.common_header + import_source
 
         @T.prim_func
         def general_dequant_matmul(
@@ -286,11 +289,20 @@ def general_dequant_matmul(
                             B_local[v] = B_shared[vi, vj]
 
                         if fast_decoding is True:
-                            T.call_extern(
+                            self._normal_fast_dequant(
+                                B_local,
+                                B_dequantize_local,
+                                Scale,
+                                Zeros,
+                                Qzeros,
                                 func_name,
-                                T.address_of(B_local[0]),
-                                T.address_of(B_dequantize_local[0]),
-                                dtype=in_dtype,
+                                by,
+                                tx,
+                                k,
+                                i,
+                                block_N,
+                                block_K,
+                                threads,
                             )
                         else:
                             self._normal_dequant(
@@ -364,7 +376,6 @@ def naive_cast_dequant(x):
 
         return dequant_func
 
-    # proxy method for macro expansion
     def _normal_dequant(
         self,
         compressed_weight_local: T.Buffer,
@@ -401,25 +412,18 @@ def _normal_dequant_impl(
             zeros_buffer: T.Buffer,
             qzeros_buffer: T.Buffer,
         ):
-            print("Normal Dequantize")
-            print("with_scaling", with_scaling)
-            print("with_zeros", with_zeros)
-            print("zeros_mode", zeros_mode)
-            print("num_bits", num_bits)
             for v in T.serial(0, local_size):
                 index = (i * threads * local_size_compressed + tx * local_size_compressed + v)
                 vi = index // (stride_k // num_elems_per_byte)
                 vj = index % (stride_k // num_elems_per_byte)
                 if not with_scaling:
-                    print("No Scaling")
                     dequant_weight_local[v] = self._decode_func(
                         num_bits,
                         compressed_weight_local[v // num_elems_per_byte],
                         v % num_elems_per_byte,
                         dtype=in_dtype,
                     )
                 elif not with_zeros:
-                    print("No Zeros")
                     # Scaling only
                     dequant_weight_local[v] = (
                         self._decode_func(
@@ -429,7 +433,6 @@ def _normal_dequant_impl(
                             dtype=in_dtype,
                         ) * scale_buffer[pid_n * stride_n + vi, (k * stride_k + vj) // group_size])
                 elif zeros_mode == "original":
-                    print("Original Zeros")
                     dequant_weight_local[v] = (self._decode_func(
                         num_bits,
                         compressed_weight_local[v // num_elems_per_byte],
@@ -439,7 +442,6 @@ def _normal_dequant_impl(
                                      group_size]) * scale_buffer[pid_n * stride_n + vi,
                                                                  (k * stride_k + vj) // group_size]
                 elif zeros_mode == "rescale":
-                    print("rescale")
                     dequant_weight_local[v] = (
                         self._decode_func(
                             num_bits,
@@ -449,7 +451,6 @@ def _normal_dequant_impl(
                         ) * scale_buffer[pid_n * stride_n + vi, (k * stride_k + vj) // group_size] -
                         zeros_buffer[pid_n * stride_n + vi, (k * stride_k + vj) // group_size])
                 elif zeros_mode == "quantized":
-                    print("Quantized Zeros")
                     dequant_qzeros = _tir_packed_to_unsigned_convert(storage_type, storage_nbit)(
                         num_bits,
                         qzeros_buffer[
@@ -476,6 +477,81 @@ def _normal_dequant_impl(
             qzeros_buffer,
         )
 
+    def _normal_fast_dequant(
+        self,
+        compressed_weight_local: T.Buffer,
+        dequant_weight_local: T.Buffer,
+        scale_buffer: T.Buffer,
+        zeros_buffer: T.Buffer,
+        qzeros_buffer: T.Buffer,
+        func_name: str,
+        pid_n: T.Var,
+        tx: T.Var,
+        k: T.Var,
+        i: T.Var,
+        stride_n: int,
+        stride_k: int,
+        threads: int,
+    ):
+        num_elems_per_byte = self.num_elems_per_byte
+        with_scaling = self.with_scaling
+        with_zeros = self.with_zeros
+        zeros_mode = self.zeros_mode
+        in_dtype = self.in_dtype
+        group_size = self.group_size
+
+        @T.macro
+        def _normal_fast_dequant_impl(
+            compressed_weight_local: T.Buffer,
+            dequant_weight_local: T.Buffer,
+            scale_buffer: T.Buffer,
+            zeros_buffer: T.Buffer,
+            qzeros_buffer: T.Buffer,
+        ):
+            if not with_scaling:
+                T.call_extern(
+                    func_name,
+                    T.address_of(compressed_weight_local[0]),
+                    T.address_of(dequant_weight_local[0]),
+                    dtype=in_dtype,
+                )
+            elif not with_zeros:
+                T.call_extern(
+                    func_name,
+                    T.address_of(compressed_weight_local[0]),
+                    T.address_of(dequant_weight_local[0]),
+                    T.address_of(scale_buffer[pid_n * stride_n, k * stride_k // group_size]),
+                    dtype=in_dtype,
+                )
+            elif zeros_mode in ["original", "rescale"]:
+                T.call_extern(
+                    func_name,
+                    T.address_of(compressed_weight_local[0]),
+                    T.address_of(dequant_weight_local[0]),
+                    T.address_of(scale_buffer[pid_n * stride_n, k * stride_k // group_size]),
+                    T.address_of(zeros_buffer[pid_n * stride_n, k * stride_k // group_size]),
+                    dtype=in_dtype,
+                )
+            elif zeros_mode == "quantized":
+                T.call_extern(
+                    func_name,
+                    T.address_of(compressed_weight_local[0]),
+                    T.address_of(dequant_weight_local[0]),
+                    T.address_of(scale_buffer[pid_n * stride_n, k * stride_k // group_size]),
+                    T.address_of(zeros_buffer[pid_n * stride_n, k * stride_k // group_size]),
+                    T.address_of(qzeros_buffer[k * stride_k // group_size,
+                                               pid_n * stride_n // num_elems_per_byte]),
+                    dtype=in_dtype,
+                )
+
+        return _normal_fast_dequant_impl(
+            compressed_weight_local,
+            dequant_weight_local,
+            scale_buffer,
+            zeros_buffer,
+            qzeros_buffer,
+        )
+
     @property
     def num_elems_per_byte(self):
         storage_nbit = int("".join(c for c in self.storage_dtype if c.isdigit()))

diff --git a/testing/python/operators/test_general_matmul_tilelang_kernel.py b/testing/python/operators/test_general_matmul_tilelang_kernel.py
@@ -457,7 +457,7 @@ def assert_matmul_blocked_dequant_with_default_correctness(
         fast_decoding=fast_decoding,
         zeros_mode=zeros_mode,
     ).with_default_config()
-
+    print(matmul)
     mod, params = tl.lower(matmul)
     src_code = mod.imported_modules[0].get_source()
     # src_code is the generated cuda source
@@ -486,8 +486,6 @@ def assert_matmul_blocked_dequant_with_default_correctness(
     if with_zeros:
         inputs[1] = inputs[1] - zeros
 
-    ref_result = torch.matmul(inputs[0], inputs[1].t().to(torch.float16))
-
     permuted_inputs = []
     permuted_inputs.append(inputs[0])
     qw = general_compress(intweight.cpu().numpy(), source_bits=bit, storage_dtype=np.int8)
@@ -522,11 +520,14 @@ def assert_matmul_blocked_dequant_with_default_correctness(
     mod(*permuted_inputs)
 
     print(permuted_inputs[-1])
+
+    ref_result = torch.matmul(inputs[0], inputs[1].t().to(torch.float16))
+
     print(ref_result)
     if zeros_mode == "rescale":
-        torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e0)
+        torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e2)
     else:
-        torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e0)
+        torch.testing.assert_close(permuted_inputs[-1], ref_result, rtol=1e2, atol=1e2)
 
 
 def test_matmul_blocked():
@@ -565,6 +566,23 @@ def test_matmul_blocked_dequant_with_default():
         1024, 1024, 1024, source_format="uint", bit=4)
     assert_matmul_blocked_dequant_with_default_correctness(
         1024, 1024, 1024, source_format="uint", bit=2)
+    assert_matmul_blocked_dequant_with_default_correctness(
+        1024, 1024, 1024, source_format="uint", bit=4, with_scaling=True)
+    assert_matmul_blocked_dequant_with_default_correctness(
+        1024, 1024, 1024, source_format="uint", bit=4, with_scaling=True, with_zeros=True)
+    assert_matmul_blocked_dequant_with_default_correctness(
+        1024, 1024, 1024, source_format="uint", bit=4, fast_decoding=True)
+    assert_matmul_blocked_dequant_with_default_correctness(
+        1024, 1024, 1024, source_format="uint", bit=4, with_scaling=True, fast_decoding=True)
+    assert_matmul_blocked_dequant_with_default_correctness(
+        1024,
+        1024,
+        1024,
+        source_format="uint",
+        bit=4,
+        with_scaling=True,
+        with_zeros=True,
+        fast_decoding=True)
 
 
 if __name__ == "__main__":

diff --git a/testing/python/operators/test_general_matmul_tilelang_scheduler.py b/testing/python/operators/test_general_matmul_tilelang_scheduler.py
@@ -78,7 +78,7 @@ def assert_dequantize_scheduler_simplify(
 
     simplified = MatmulDequantizeScheduler.Simplify(matmul)
     print(simplified)
-    is_equal = structural_equal(matmul, simplified)
+    is_equal = structural_equal(matmul, simplified)  # noqa: F841
     assert simplified is not None, "Simplify should return a schedule"
 
 

diff --git a/testing/python/tilelang/test_tilelang_dequantize_gemm.py b/testing/python/tilelang/test_tilelang_dequantize_gemm.py
@@ -437,4 +437,5 @@ def test_assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4():
 
 
 if __name__ == "__main__":
-    bitblas.testing.main()
+    # bitblas.testing.main()
+    run_gemm(256, 256, 256, "float16", "float16", "float16", 128, 128, 32, num_threads=128)