Fix Transformer compat #3

Merged: 27 commits, Dec 5, 2024

Commits (27)
25a2149
revert quantizer_gptq.py change
LRL-ModelCloud Dec 3, 2024
21e5f29
pass **kwargs
LRL-ModelCloud Dec 3, 2024
e6335b4
add meta info
ZX-ModelCloud Dec 4, 2024
ae24ab6
cleanup
ZX-ModelCloud Dec 4, 2024
2920bb7
cleanup
ZX-ModelCloud Dec 4, 2024
61734ce
Update quantization_config.py
Qubitium Dec 4, 2024
c144d25
Merge remote-tracking branch 'origin/revert-quantizer-gptq' into reve…
ZX-ModelCloud Dec 4, 2024
8f722e3
Merge remote-tracking branch 'INTEL/gptq' into revert-quantizer-gptq
ZX-ModelCloud Dec 4, 2024
66b1f76
hf_select_quant_linear pass checkpoint_format and meta
ZX-ModelCloud Dec 4, 2024
1d466d3
fix GPTQTestCUDA
ZX-ModelCloud Dec 4, 2024
dbad8de
Update test_gptq.py
Qubitium Dec 4, 2024
3f7cc36
gptqmodel.hf_select_quant_linear() now does not select ExllamaV2
ZX-ModelCloud Dec 4, 2024
b406edf
cleanup
ZX-ModelCloud Dec 4, 2024
a1a0a01
add backend
LRL-ModelCloud Dec 5, 2024
82d6bc2
cleanup
LRL-ModelCloud Dec 5, 2024
13b4a53
cleanup
LRL-ModelCloud Dec 5, 2024
b0faae5
no need check exllama version
LRL-ModelCloud Dec 5, 2024
d8879b2
Update quantization_config.py
Qubitium Dec 5, 2024
b3d43b3
lower checkpoint_format and backend
LRL-ModelCloud Dec 5, 2024
972400c
check none
LRL-ModelCloud Dec 5, 2024
dd5d330
cleanup
LRL-ModelCloud Dec 5, 2024
bd6a207
Update quantization_config.py
Qubitium Dec 5, 2024
b5a124e
fix self.use_exllama == False
LRL-ModelCloud Dec 5, 2024
87fb26c
spell
Qubitium Dec 5, 2024
2a6295e
fix unittest
ZX-ModelCloud Dec 5, 2024
bd0d906
Merge remote-tracking branch 'origin/revert-quantizer-gptq' into reve…
ZX-ModelCloud Dec 5, 2024
b090ca2
fix unittest
ZX-ModelCloud Dec 5, 2024
45 changes: 35 additions & 10 deletions src/transformers/utils/quantization_config.py
@@ -25,7 +25,9 @@

from packaging import version

from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, is_torchao_available, logging
from .import_utils import is_auto_gptq_available
from ..utils import (is_auto_awq_available, is_hqq_available, is_torch_available, is_gptqmodel_available,
is_torchao_available, logging)


if is_torch_available():
@@ -577,8 +579,14 @@ class GPTQConfig(QuantizationConfigMixin):
quantization using inputs that have passed through the previously quantized layers.
checkpoint_format (`str`, *optional*, defaults to `"gptq"`):
GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
meta (`Dict[str, any]`, *optional*):
Properties, such as tooling:version, that do not directly contribute to quantization or quant inference are stored in meta.
e.g. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
backend (`str`, *optional*):
Controls which gptq kernel is used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, the only
valid values are None and `auto_trainable`. See gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
use_cuda_fp16 (`bool`, *optional*, defaults to `False`):
Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
Whether or not to use optimized cuda kernel for fp16 model. The model needs to be in fp16. Auto-gptq only.
model_seqlen (`int`, *optional*):
The maximum sequence length that the model can take.
block_name_to_quantize (`str`, *optional*):
@@ -618,7 +626,9 @@ def __init__(
desc_act: bool = False,
sym: bool = True,
true_sequential: bool = True,
checkpoint_format: Optional[str] = "gptq",
checkpoint_format: str = "gptq",
meta: Optional[Dict[str, any]] = None,
backend: Optional[str] = None,
use_cuda_fp16: bool = False,
model_seqlen: Optional[int] = None,
block_name_to_quantize: Optional[str] = None,
@@ -641,6 +651,9 @@
self.desc_act = desc_act
self.sym = sym
self.true_sequential = true_sequential
self.checkpoint_format = checkpoint_format.lower()
self.meta = meta
self.backend = backend.lower() if isinstance(backend, str) else backend
self.use_cuda_fp16 = use_cuda_fp16
self.model_seqlen = model_seqlen
self.block_name_to_quantize = block_name_to_quantize
@@ -653,7 +666,6 @@ def __init__(
self.disable_exllama = kwargs.pop("disable_exllama", None)
self.cache_block_outputs = cache_block_outputs
self.modules_in_block_to_quantize = modules_in_block_to_quantize
self.checkpoint_format = checkpoint_format
self.post_init()

def get_loading_attributes(self):
@@ -690,6 +702,17 @@ def post_init(self):
['wikitext2','c4','c4-new'], but we found {self.dataset}"""
)

# make sure backend is back/forward compatible with both gptqmodel (full) and auto-gptq (partial)
if is_gptqmodel_available():
# convert auto-gptq control into gptqmodel backend
if self.backend is None:
self.backend = "auto_trainable" if self.use_exllama == False else "auto"
else:
# convert gptqmodel backend `auto_trainable` into auto-gptq control
if self.backend == "auto_trainable":
self.use_exllama = False

# auto-gptq specific kernel control logic
if self.disable_exllama is None and self.use_exllama is None:
# New default behaviour
self.use_exllama = True
@@ -723,19 +746,21 @@ def post_init(self):
"speed using exllamav2 kernel by setting `exllama_config`."
)
elif self.exllama_config["version"] == ExllamaVersion.TWO:
optimum_version = version.parse(importlib.metadata.version("optimum"))
autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
raise ValueError(
f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
)
if is_auto_gptq_available():
optimum_version = version.parse(importlib.metadata.version("optimum"))
autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
raise ValueError(
f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
)
if self.modules_in_block_to_quantize is not None:
optimum_version = version.parse(importlib.metadata.version("optimum"))
if optimum_version < version.parse("1.15.0"):
raise ValueError(
"You current version of `optimum` does not support `modules_in_block_to_quantize` quantization argument, please upgrade `optimum` package to a version superior than 1.15.0 ."
)


def to_dict(self):
config_dict = super().to_dict()
config_dict.pop("disable_exllama", None)
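For context on the `checkpoint_format`, `meta`, and `backend` options documented above, here is a minimal sketch of how they might be passed through `GPTQConfig`; the model name, dataset, and meta values are placeholders, not taken from this PR.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder model; any GPTQ-quantizable causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
    group_size=128,
    sym=True,
    desc_act=False,
    checkpoint_format="gptq",  # "gptq" (v1) works with gptqmodel and auto-gptq; "gptq_v2" is gptqmodel only
    meta={"quantizer": ["optimum:_version_", "gptqmodel:_version_"]},  # tooling info, not used for inference
    backend="auto",  # gptqmodel kernel selection; with auto-gptq only None or "auto_trainable" are valid
)

# post_init() keeps backend and use_exllama consistent across libraries:
# with gptqmodel installed, backend=None becomes "auto_trainable" when use_exllama is False, else "auto";
# with auto-gptq, backend="auto_trainable" forces use_exllama=False.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
```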
43 changes: 31 additions & 12 deletions tests/quantization/gptq/test_gptq.py
@@ -18,7 +18,7 @@

import pytest

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AutoConfig
from transformers.testing_utils import (
is_torch_available,
require_accelerate,
@@ -84,12 +84,14 @@ class GPTQTest(unittest.TestCase):
input_text = "Hello my name is"

EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output-equivalent, nor is the string comparison deterministic across transformers/torch versions
EXPECTED_OUTPUTS.add("Hello my name is Katie, I am a 22 year")

# this seems a little small considering that we are doing 4bit quant but we have a small model and we don't quantize the embeddings
EXPECTED_RELATIVE_DIFFERENCE = 2.06183008

bits = 4
sym = True
group_size = 128
desc_act = False
use_exllama = False
@@ -112,21 +114,23 @@ def setUpClass(cls):
cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
cls.config = AutoConfig.from_pretrained(cls.model_name)

quantization_config = GPTQConfig(
cls.quantization_config = GPTQConfig(
bits=cls.bits,
dataset=cls.dataset,
tokenizer=cls.tokenizer,
group_size=cls.group_size,
desc_act=cls.desc_act,
sym=cls.sym,
use_exllama=cls.use_exllama,
)

cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
torch_dtype=torch.float16,
device_map=cls.device_map,
quantization_config=quantization_config,
quantization_config=cls.quantization_config,
)

def test_memory_footprint(self):
@@ -167,14 +171,21 @@ def test_quantized_layers_class(self):
"""
if is_gptqmodel_available():
from gptqmodel.utils.importer import hf_select_quant_linear

if hasattr(self.config, "quantization_config"):
checkpoint_format = self.config.quantization_config.get("checkpoint_format")
meta = self.config.quantization_config.get("meta")
else:
checkpoint_format = "gptq"
meta = None
QuantLinear = hf_select_quant_linear(
bits=self.bits,
group_size=self.group_size,
desc_act=self.desc_act,
sym=True,
sym=self.sym,
device_map=self.device_map,
pack=False,
checkpoint_format=checkpoint_format,
meta=meta,
backend=self.quantization_config.backend,
)
elif is_auto_gptq_available():
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
@@ -187,7 +198,7 @@ def test_quantized_layers_class(self):
disable_exllama=not self.use_exllama,
disable_exllamav2=True,
)
self.assertTrue(self.quantized_model.model.layers[0].mlp.gate_proj.__class__ == QuantLinear)
self.assertEqual(self.quantized_model.model.layers[0].mlp.gate_proj.__class__, QuantLinear)

def check_inference_correctness(self, model):
r"""
@@ -205,13 +216,13 @@ def check_inference_correctness(self, model):
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

def check_quantized_layers_type(self, model, value):
self.assertTrue(model.model.layers[0].mlp.gate_proj.QUANT_TYPE == value)
self.assertEqual(model.model.layers[0].mlp.gate_proj.QUANT_TYPE, value)

def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
if self.device_map != "cpu":
if self.device_map is None:
self.check_inference_correctness(self.quantized_model.to(0))
else:
self.check_inference_correctness(self.quantized_model)
@@ -235,7 +246,7 @@ def test_serialization(self):
tmpdirname, device_map=self.device_map
)
else:
quant_type = "ipex" if self.device_map == "cpu" else "cuda"
quant_type = "ipex" if self.device_map == "cpu" else "exllama"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
@@ -259,6 +270,12 @@ class GPTQTestCUDA(GPTQTest):
EXPECTED_RELATIVE_DIFFERENCE = 2.06183008
device_map = {"": 0}

@classmethod
def setUpClass(cls):
super().setUpClass()
# flaky test: gptqmodel and auto-gptq are not output-equivalent, nor is the string comparison deterministic across transformers/torch versions
cls.EXPECTED_OUTPUTS.add("Hello my name is Katie. I am a 20 year")

def test_change_loading_attributes(self):
"""
Test the serialization of the model and the loading of the quantized weights works with another config file
@@ -302,6 +319,7 @@ class GPTQTestActOrderExllama(unittest.TestCase):
"""

EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output-equivalent, nor is the string comparison deterministic across transformers/torch versions
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
# 4bit + act_order + 128g
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
@@ -338,7 +356,7 @@ def check_inference_correctness(self, model):
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

def test_quantized_layers_type(self):
self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllama")
self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama")

def test_generate_quality(self):
"""
@@ -377,6 +395,7 @@ class GPTQTestExllamaV2(unittest.TestCase):
"""

EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output-equivalent, nor is the string comparison deterministic across transformers/torch versions
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
# 4bit + act_order + 128g
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
@@ -397,7 +416,7 @@ def setUpClass(cls):
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

def test_quantized_layers_type(self):
self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllamav2")
self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE, "exllama" if is_gptqmodel_available() else "exllamav2")

def check_inference_correctness(self, model):
"""
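For reference, a rough sketch of the gptqmodel kernel-selection call that the updated `test_quantized_layers_class` exercises, assuming gptqmodel is installed; the concrete values mirror the test defaults and are illustrative.

```python
from gptqmodel.utils.importer import hf_select_quant_linear

# checkpoint_format and meta come from the model config's quantization_config when present,
# otherwise they fall back to "gptq" and None; backend comes from the GPTQConfig used to quantize.
QuantLinear = hf_select_quant_linear(
    bits=4,
    group_size=128,
    desc_act=False,
    sym=True,
    device_map="cpu",  # illustrative; GPTQTestCUDA uses {"": 0}
    pack=False,
    checkpoint_format="gptq",
    meta=None,
    backend=None,  # or "auto" / "auto_trainable", see GPTQConfig.backend above
)
print(QuantLinear)  # the quant-linear class gptqmodel selected for these settings
```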