This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit: adapt python code
Signed-off-by: changwangss <chang1.wang@intel.com>
changwangss committed May 9, 2024
1 parent a0cac9f commit bf511a3
Showing 4 changed files with 17 additions and 22 deletions.
@@ -213,9 +213,8 @@ def set_weights_bias(
     else:
         g_idx = torch.empty(0, dtype=torch.int32)
     if q_config.bits == 4:
-        int_weight = (int_weight - 8) * 16
-        gptq_scales = gptq_scales / 16
-        gptq_zeros = (gptq_zeros - 8) * 16
+        int_weight = int_weight - 8
+        gptq_zeros = gptq_zeros - 8

     if q_config.sym:
         gptq_zeros = torch.empty(0, dtype=torch.int8)
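For reference, a minimal standalone sketch of the new 4-bit handling in the hunk above: unsigned 4-bit GPTQ weights and zero points are shifted into the signed range by subtracting 8, and the old int4_fullrange rescaling (the * 16 on weights/zeros and the / 16 on scales) is gone. The function name and dummy tensors below are illustrative only and are not part of this commit.

```python
# Minimal sketch (not the repo's set_weights_bias): only the arithmetic from
# the diff above is shown; names and shapes are made up for the example.
import torch

def adjust_int4_gptq(int_weight: torch.Tensor,
                     gptq_zeros: torch.Tensor,
                     sym: bool):
    # Shift unsigned 4-bit values [0, 15] into the signed range [-8, 7];
    # the removed int4_fullrange path additionally multiplied by 16.
    int_weight = int_weight - 8
    gptq_zeros = gptq_zeros - 8
    if sym:
        # Symmetric quantization carries no zero points.
        gptq_zeros = torch.empty(0, dtype=torch.int8)
    return int_weight, gptq_zeros

w = torch.randint(0, 16, (4, 8), dtype=torch.int32)
z = torch.randint(0, 16, (4, 1), dtype=torch.int32)
shifted_w, shifted_z = adjust_int4_gptq(w, z, sym=False)
print(shifted_w.min() >= -8, shifted_w.max() <= 7)
```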
@@ -344,13 +343,12 @@ def recover_int_weight(g_idx, int_weight):
         if scales_dtype is None:
             assert False, "scales dtype only support fp32."
         scales = qbits.acquire_packed_weight_info(self.weight, 9)
-        if bits == 4:
-            scales = scales * 16

         zp = qbits.acquire_packed_weight_info(self.weight, 11)[0] != 0
         if zp:
             qzeros = qbits.acquire_packed_weight_info(self.weight, 10)
             if bits == 4:
-                qzeros = qzeros // 16 + 8
+                qzeros = qzeros + 8
             else:
                 qzeros = (qzeros.to(torch.int32) + 128).to(torch.uint8)
         else:
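A standalone sketch of the zero-point recovery arithmetic shown above, with the qbits.acquire_packed_weight_info() calls replaced by dummy tensors. It only illustrates the +8 shift for 4-bit zeros (the // 16 is dropped along with int4_fullrange) and the int8-to-uint8 mapping for 8-bit zeros; it is not the repository's method.

```python
# Standalone sketch of the qzeros recovery; dummy tensors stand in for the
# packed-weight queries, so only the arithmetic is illustrated.
import torch

def recover_qzeros(qzeros: torch.Tensor, bits: int) -> torch.Tensor:
    if bits == 4:
        # Stored 4-bit zero points are signed (shifted by -8 at pack time);
        # add 8 to return to the unsigned [0, 15] range.
        return qzeros + 8
    # 8-bit zero points are stored as int8 and mapped back to uint8.
    return (qzeros.to(torch.int32) + 128).to(torch.uint8)

print(recover_qzeros(torch.tensor([-8, 0, 7], dtype=torch.int8), bits=4))
print(recover_qzeros(torch.tensor([-128, 0, 127], dtype=torch.int8), bits=8))
```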
@@ -281,7 +281,6 @@ def post_init_cpu(self):
             )

         if self.bits == 4 and self.weight_dtype not in [
-            "int4_fullrange",
             "int4_clip",
             "nf4",
             "fp4_e2m1_bnb",
@@ -300,7 +299,6 @@ def post_init_cpu(self):

         elif self.weight_dtype not in [
             "int8",
-            "int4_fullrange",
             "int4_clip",
             "nf4",
             "fp4_e2m1_bnb",
@@ -310,7 +308,7 @@ def post_init_cpu(self):
         ]:
             raise ValueError(
                 f"weight_dtype must be a string in "
-                f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'"
+                f"'int8', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'"
             )

         if self.scale_dtype is not None and self.scale_dtype not in [
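To see the effect of dropping "int4_fullrange" from the allowed dtypes, a rough usage sketch follows. The import path and the assumption that post_init_cpu() performs the validation shown above are taken from the tests below and are not verified against the full library.

```python
# Hypothetical check of the tightened weight_dtype validation; import path
# and validation entry point are assumptions based on the tests in this commit.
from intel_extension_for_transformers.transformers import RtnConfig

ok = RtnConfig(bits=4, weight_dtype="int4_clip", group_size=32)
ok.post_init_cpu()  # passes: int4_clip is still in the supported list

try:
    bad = RtnConfig(bits=4, weight_dtype="int4_fullrange", group_size=32)
    bad.post_init_cpu()
except ValueError as e:
    # int4_fullrange was removed from the supported dtypes in this commit.
    print(e)
```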
15 changes: 7 additions & 8 deletions tests/CI/test_quantization.py
@@ -408,18 +408,17 @@ def test_quantization_for_llm(self):

         # weight-only
         # RTN
-        woq_config = RtnConfig(bits=4, weight_dtype="int4_fullrange")
+        woq_config = RtnConfig(bits=4)
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                          quantization_config=woq_config,
                                                          use_neural_speed=False
                                                          )
         woq_model.eval()
         output = woq_model(dummy_input)
-        self.assertTrue(isclose(float(output[0][0][0][0]), 0.16387596726417542, rel_tol=1e-04))
+        self.assertTrue(isclose(float(output[0][0][0][0]), 0.17631684243679047, rel_tol=1e-04))

         # AWQ
         woq_config = AwqConfig(bits=4,
-                               weight_dtype="int4_fullrange",
                                zero_point=False,
                                calib_iters=5,
                                tokenizer=tokenizer
@@ -431,13 +430,13 @@ def test_quantization_for_llm(self):
                                                          )
         woq_model.eval()
         output = woq_model(dummy_input)
-        self.assertTrue(isclose(float(output[0][0][0][0]), 0.17998121678829193, rel_tol=1e-04))
+        self.assertTrue(isclose(float(output[0][0][0][0]), 0.18019595742225647, rel_tol=1e-04))

         # TEQ
-        woq_config = TeqConfig(bits=4, weight_dtype="int4_fullrange",
-                               calib_iters=5,
-                               tokenizer=tokenizer,
-                               )
+        woq_config = TeqConfig(bits=4,
+                               calib_iters=5,
+                               tokenizer=tokenizer,
+                               )
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                          quantization_config=woq_config,
                                                          use_neural_speed=False
10 changes: 5 additions & 5 deletions tests/CI/test_weight_only.py
@@ -86,9 +86,9 @@ def tearDownClass(cls) -> None:

     def test_woq_config(self):
         config = RtnConfig(
-            bits=4, weight_dtype="int4_fullrange", group_size=32)
+            bits=4, weight_dtype="int4_clip", group_size=32)
         diff_res = config.to_diff_dict()
-        ref_config = {'weight_dtype': 'int4_fullrange'}
+        ref_config = {'weight_dtype': 'int4_clip'}
         self.assertEqual(diff_res, ref_config)
         print(diff_res)
         print(config.to_dict())
@@ -133,10 +133,10 @@ def test_int8(self):
     def test_int4(self):
         raw_wei = torch.rand(2, 32, dtype=torch.float)
         compress_wei = qbits.quantize_to_packed_weight(
-            raw_wei, True, 32, "fp32", "int4_fullrange", "fp32", False)
+            raw_wei, True, 32, "fp32", "nf4", "fp32", False)
         revert_wei = torch.zeros(2, 32, dtype=torch.float)
         qbits.dequantize_packed_weight(compress_wei, revert_wei, True,
-                                       "fp32", "int4_fullrange", "fp32")
+                                       "fp32", "nf4", "fp32")
         for bias in [True, False]:
             model = M(with_bias=bias)
             with torch.no_grad():
@@ -146,7 +146,7 @@ def test_int4(self):
             with torch.no_grad():
                 model.linear.weight = torch.nn.Parameter(raw_wei)
             config = RtnConfig(
-                bits=4, weight_dtype="int4_fullrange", group_size=32)
+                bits=4, weight_dtype="nf4", group_size=32)
             config.post_init_cpu()
             convert_to_quantized_model(model, config)
             output_quant = model(activation)
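A rough round-trip check for the nf4 path exercised in test_int4 above. The qbits argument order is copied from the test; the import path is an assumption, and the final comparison is only an illustration, not a check the repository performs.

```python
# Sketch of the nf4 quantize/dequantize round trip; the qbits import path is
# assumed to match the test file's usage and may differ in practice.
import torch
from intel_extension_for_transformers import qbits  # import path assumed

raw_wei = torch.rand(2, 32, dtype=torch.float)
compress_wei = qbits.quantize_to_packed_weight(
    raw_wei, True, 32, "fp32", "nf4", "fp32", False)
revert_wei = torch.zeros(2, 32, dtype=torch.float)
qbits.dequantize_packed_weight(compress_wei, revert_wei, True,
                               "fp32", "nf4", "fp32")
# nf4 is lossy, so the error is small but nonzero.
print(torch.max(torch.abs(raw_wei - revert_wei)))
```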
