From f2db23df40a887460d2f49e3c563c2310785d155 Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Tue, 4 Jun 2024 13:35:39 -0400
Subject: [PATCH] Refactor rest of tinygemm quant primitive ops

Summary:
This PR replaces the remaining tinygemm-specific quant primitive ops with the
general quant primitive ops that we want to use for everything. We can delete
the tinygemm-specific ops in a separate PR if needed.

Test Plan:
python test/quantization/test_quant_primitives.py -k test_get_groupwise_affine_qparams
python test/quantization/test_quant_primitives.py -k test_groupwise_affine_quantize_tensor_from_qparams
python test/quantization/test_quant_primitives.py -k test_groupwise_affine_dequantize_tensor_from_qparams

accuracy:
perf: no diff for generated code with `TORCH_LOGS='output_code' python tutorials/quantize_vit/run_vit_b_quant.py`
---
 test/quantization/test_quant_primitives.py | 109 ++++++++++++++++++++-
 torchao/quantization/quant_primitives.py   |  76 +++++++-------
 2 files changed, 142 insertions(+), 43 deletions(-)

diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py
index 3ce53cbde..6054c6e66 100644
--- a/test/quantization/test_quant_primitives.py
+++ b/test/quantization/test_quant_primitives.py
@@ -11,6 +11,8 @@ from torchao.quantization.quant_primitives import (
     get_group_qparams_symmetric,
     get_groupwise_affine_qparams,
+    groupwise_affine_quantize_tensor_from_qparams,
+    groupwise_affine_dequantize_tensor_from_qparams,
     quantize_affine,
     dequantize_affine,
     choose_qparams_affine,
@@ -38,6 +40,86 @@ def check_idempotent(self, fn, *args, **kwargs):
     self.assertTrue(torch.equal(output0, output1), f"Expected given function {fn} to be idempotent.")
     return output1
 
+# Legacy tinygemm ops, used as reference implementations in the tests below
+def _get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16):
+    if groupsize > w.shape[-1]:
+        groupsize = w.shape[-1]
+    assert groupsize > 1
+    assert w.shape[-1] % groupsize == 0
+    assert w.dim() == 2
+
+    to_quant = w.reshape(-1, groupsize)
+    # assert torch.isnan(to_quant).sum() == 0
+
+    max_val = to_quant.amax(dim=1, keepdim=True)
+    min_val = to_quant.amin(dim=1, keepdim=True)
+    max_int = 2**n_bit - 1
+    scales = (max_val - min_val).clamp(min=1e-6) / max_int
+    zeros = min_val + scales * (2 ** (n_bit - 1))
+    return scales.to(dtype=dtype).reshape(w.shape[0], -1), zeros.to(
+        dtype=dtype
+    ).reshape(w.shape[0], -1)
+
+def _groupwise_affine_quantize_tensor_from_qparams(
+    w,
+    scales,
+    zeros,
+    n_bit=4,
+    groupsize=128,
+):
+    assert groupsize > 1
+    # needed for GPTQ single column quantize
+    if groupsize > w.shape[-1] and scales.shape[-1] == 1:
+        groupsize = w.shape[-1]
+
+    assert w.shape[-1] % groupsize == 0
+    assert w.dim() == 2
+
+    to_quant = w.reshape(-1, groupsize)
+    # assert torch.isnan(to_quant).sum() == 0
+
+    scales = scales.reshape(-1, 1)
+    zeros = zeros.reshape(-1, 1)
+    min_val = zeros - scales * (2 ** (n_bit - 1))
+    max_int = 2**n_bit - 1
+    min_int = 0
+    w_int4x8 = (
+        to_quant.sub(min_val)
+        .div(scales)
+        .round()
+        .clamp_(min_int, max_int)
+        .to(torch.int32)
+        .reshape_as(w)
+    )
+
+    return w_int4x8
+
+def _groupwise_affine_dequantize_tensor_from_qparams(
+    w_int4x8,
+    scales,
+    zeros,
+    n_bit=4,
+    groupsize=128,
+):
+    assert groupsize > 1
+    # needed for GPTQ single column dequantize
+    if groupsize > w_int4x8.shape[-1] and scales.shape[-1] == 1:
+        groupsize = w_int4x8.shape[-1]
+    assert w_int4x8.shape[-1] % groupsize == 0
+    assert w_int4x8.dim() == 2
+
+    w_int4x8_grouped = w_int4x8.reshape(-1, groupsize)
+    scales = scales.reshape(-1, 1)
+    zeros = zeros.reshape(-1, 1)
+
+    w_dq = (
+        w_int4x8_grouped.sub(2 ** (n_bit - 1))
+        .mul(scales)
+        .add(zeros)
+        .reshape_as(w_int4x8)
+    )
+    return w_dq
+
 class TestQuantPrimitives(unittest.TestCase):
     SEED = 123
 
@@ -356,12 +438,12 @@ def test_not_preserve_zero_not_supported(self):
         )
 
-    def test_tinygemm_get_groupwise_affine_qparams(self):
+    def test_get_groupwise_affine_qparams(self):
         from torchao.quantization.quant_primitives import ZeroPointDomain
 
         input = torch.randn(10, 256)
         n_bit = 4
-        scale_ref, zero_point_ref = get_groupwise_affine_qparams(input, n_bit=n_bit, groupsize=128, dtype=torch.bfloat16)
+        scale_ref, zero_point_ref = _get_groupwise_affine_qparams(input, n_bit=n_bit, groupsize=128, dtype=torch.bfloat16)
 
         mapping_type = MappingType.ASYMMETRIC
         dtype = torch.int8
@@ -389,6 +471,29 @@ def test_tinygemm_get_groupwise_affine_qparams(self):
         self.assertTrue(torch.equal(scale, scale_ref))
         self.assertTrue(torch.equal(zero_point, zero_point_ref))
 
+    def test_groupwise_affine_quantize_tensor_from_qparams(self):
+        input = torch.randn(10, 256)
+        scales = torch.randn(10, 2)
+        zeros = torch.randn(10, 2)
+        n_bit = 4
+        groupsize = 128
+
+        w_int4x8 = groupwise_affine_quantize_tensor_from_qparams(input, scales, zeros, n_bit, groupsize)
+        w_int4x8_ref = _groupwise_affine_quantize_tensor_from_qparams(input, scales, zeros, n_bit, groupsize)
+
+        self.assertTrue(torch.equal(w_int4x8, w_int4x8_ref))
+
+    def test_groupwise_affine_dequantize_tensor_from_qparams(self):
+        input = torch.randint(0, 15, (10, 256), dtype=torch.int32)
+        scales = torch.randn(10, 2).bfloat16()
+        zeros = torch.randn(10, 2).bfloat16()
+        n_bit = 4
+        groupsize = 128
+
+        w_bf16 = groupwise_affine_dequantize_tensor_from_qparams(input, scales, zeros, n_bit, groupsize)
+        w_bf16_ref = _groupwise_affine_dequantize_tensor_from_qparams(input, scales, zeros, n_bit, groupsize)
+
+        self.assertTrue(torch.equal(w_bf16, w_bf16_ref))
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
index 7cf6efd9d..d1ad1e740 100644
--- a/torchao/quantization/quant_primitives.py
+++ b/torchao/quantization/quant_primitives.py
@@ -249,7 +249,7 @@ def dequantize_affine(
 
     # TODO: validations
     # TODO: validate scale/zero_point dimensions are compatible with block_size
-    assert input.dtype == input_dtype
+    assert input.dtype == input_dtype, f"Expected: {input_dtype}, got: {input.dtype}"
     assert output_dtype in [torch.float32, torch.float16, torch.bfloat16], f"Unsupported output dtype: {output_dtype}"
     quant_min, quant_max = _get_and_check_qmin_qmax(input_dtype, quant_min, quant_max)
@@ -644,22 +644,37 @@ def quant_int8_per_token_matmul(
 
 def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16):
-    """This is tinygemm specific, we'll keep this for now"""
     if groupsize > w.shape[-1]:
         groupsize = w.shape[-1]
     assert groupsize > 1
     assert w.shape[-1] % groupsize == 0
     assert w.dim() == 2
+    assert n_bit <= 8, f"only n_bit <= 8 is supported, got: {n_bit}"
 
-    to_quant = w.reshape(-1, groupsize)
-    # assert torch.isnan(to_quant).sum() == 0
+    mapping_type = MappingType.ASYMMETRIC
+    target_dtype = torch.int32
+    block_size = (1, groupsize)
+    quant_min = 0
+    quant_max = 2**n_bit - 1
+    eps = 1e-6
+    scale_dtype = dtype
+    zero_point_dtype = dtype
+
+    scale, zero_point = choose_qparams_affine(
+        w,
+        mapping_type,
+        block_size,
+        target_dtype,
+        quant_min,
+        quant_max,
+        eps,
+        scale_dtype=scale_dtype,
+        zero_point_dtype=zero_point_dtype,
+        preserve_zero=False,
+        zero_point_domain=ZeroPointDomain.FLOAT,
+    )
 
-    max_val = to_quant.amax(dim=1, keepdim=True)
-    min_val = to_quant.amin(dim=1, keepdim=True)
-    max_int = 2**n_bit - 1
-    scales = (max_val - min_val).clamp(min=1e-6) / max_int
-    zeros = min_val + scales * (2 ** (n_bit - 1))
-    return scales.to(dtype=dtype).reshape(w.shape[0], -1), zeros.to(
+    return scale.to(dtype=dtype).reshape(w.shape[0], -1), zero_point.to(
         dtype=dtype
     ).reshape(w.shape[0], -1)
@@ -692,7 +707,6 @@ def groupwise_affine_quantize_tensor_from_qparams(
     n_bit=4,
     groupsize=128,
 ):
-    """This is tinygemm specific, we'll keep this for now"""
     assert groupsize > 1
     # needed for GPTQ single column quantize
     if groupsize > w.shape[-1] and scales.shape[-1] == 1:
@@ -701,25 +715,12 @@ def groupwise_affine_quantize_tensor_from_qparams(
     assert w.shape[-1] % groupsize == 0
     assert w.dim() == 2
 
-    to_quant = w.reshape(-1, groupsize)
-    # assert torch.isnan(to_quant).sum() == 0
-
-    scales = scales.reshape(-1, 1)
-    zeros = zeros.reshape(-1, 1)
-    min_val = zeros - scales * (2 ** (n_bit - 1))
-    max_int = 2**n_bit - 1
-    min_int = 0
-    w_int4x8 = (
-        to_quant.sub(min_val)
-        .div(scales)
-        .round()
-        .clamp_(min_int, max_int)
-        .to(torch.int32)
-        .reshape_as(w)
-    )
-
-    return w_int4x8
+    block_size = (1, groupsize)
+    output_dtype = torch.int32
+    quant_min = 0
+    quant_max = 2**n_bit - 1
+    return quantize_affine(w, block_size, scales, zeros, output_dtype, quant_min, quant_max, zero_point_domain=ZeroPointDomain.FLOAT)
 
 def groupwise_affine_dequantize_tensor_from_qparams(
     w_int4x8,
@@ -728,7 +729,6 @@ def groupwise_affine_dequantize_tensor_from_qparams(
     n_bit=4,
     groupsize=128,
 ):
-    """This is tinygemm specific, we'll keep this for now"""
     assert groupsize > 1
     # needed for GPTQ single column dequantize
     if groupsize > w_int4x8.shape[-1] and scales.shape[-1] == 1:
@@ -736,17 +736,11 @@ def groupwise_affine_dequantize_tensor_from_qparams(
     assert w_int4x8.shape[-1] % groupsize == 0
     assert w_int4x8.dim() == 2
 
-    w_int4x8_grouped = w_int4x8.reshape(-1, groupsize)
-    scales = scales.reshape(-1, 1)
-    zeros = zeros.reshape(-1, 1)
-
-    w_dq = (
-        w_int4x8_grouped.sub(2 ** (n_bit - 1))
-        .mul(scales)
-        .add(zeros)
-        .reshape_as(w_int4x8)
-    )
-    return w_dq
+    block_size = (1, groupsize)
+    input_dtype = torch.int32
+    quant_min = 0
+    quant_max = 2**n_bit - 1
+    return dequantize_affine(w_int4x8, block_size, scales, zeros, input_dtype, quant_min, quant_max, zero_point_domain=ZeroPointDomain.FLOAT, output_dtype=scales.dtype)
 
 
 def groupwise_affine_quantize_tensor(w, n_bit=4, groupsize=128, dtype=torch.bfloat16):
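
A note on the correspondence this refactor relies on: tinygemm's groupwise qparams are asymmetric affine qparams with `preserve_zero=False` and the zero point kept in the float domain (`ZeroPointDomain.FLOAT`), i.e. `scale = (max - min).clamp(min=eps) / (2**n_bit - 1)` and `zero = min + scale * 2**(n_bit - 1)`. The sketch below is a minimal, standalone check of those formulas in plain PyTorch; the helper names (`tinygemm_qparams`, `affine_quant_float_zp`, `affine_dequant_float_zp`) are illustrative only, not torchao APIs.

```python
# Standalone sketch of the tinygemm <-> affine-with-float-zero-point mapping.
# Assumes the legacy formulas shown in this patch; helper names are hypothetical.
import torch

def tinygemm_qparams(w, n_bit=4, groupsize=128, eps=1e-6):
    """Legacy tinygemm qparams, per group of `groupsize` along the last dim."""
    g = w.reshape(-1, groupsize)
    min_val = g.amin(dim=1, keepdim=True)
    max_val = g.amax(dim=1, keepdim=True)
    scale = (max_val - min_val).clamp(min=eps) / (2**n_bit - 1)
    zero = min_val + scale * 2 ** (n_bit - 1)  # float-domain zero point
    return scale, zero

def affine_quant_float_zp(w, scale, zero, n_bit=4, groupsize=128):
    """Affine quantization with the zero point interpreted in the float domain."""
    g = w.reshape(-1, groupsize)
    min_val = zero - scale * 2 ** (n_bit - 1)  # recover the per-group minimum
    q = ((g - min_val) / scale).round().clamp(0, 2**n_bit - 1).to(torch.int32)
    return q.reshape_as(w)

def affine_dequant_float_zp(q, scale, zero, n_bit=4, groupsize=128):
    """Mirror of the legacy dequant: (q - 2**(n_bit-1)) * scale + zero."""
    g = q.reshape(-1, groupsize)
    return ((g - 2 ** (n_bit - 1)) * scale + zero).reshape_as(q)

w = torch.randn(10, 256)
scale, zero = tinygemm_qparams(w)
q = affine_quant_float_zp(w, scale, zero)
w_dq = affine_dequant_float_zp(q, scale, zero)
# Round-trip error is bounded by half a quantization step per group.
assert (w - w_dq).abs().max() <= scale.max() / 2 + 1e-4
print("round-trip max error:", (w - w_dq).abs().max().item())
```

Algebraically, `dequant(quant(x)) = round((x - min) / scale) * scale + min`, since the `2**(n_bit - 1)` offset in the zero point cancels; that is why `choose_qparams_affine(..., preserve_zero=False, zero_point_domain=ZeroPointDomain.FLOAT)` plus `quantize_affine`/`dequantize_affine` reproduce the legacy tinygemm ops bit-for-bit, which is what the new parity tests assert.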