From 2b859cf81f18049f4a48e36e5c61da006860a772 Mon Sep 17 00:00:00 2001
From: PZS-ModelCloud <pzs@modelcloud.ai>
Date: Thu, 1 Aug 2024 10:08:06 +0000
Subject: [PATCH 1/4] log total quantized model size in save_quantized

---
 gptqmodel/models/base.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 22d0a331d..657255bf5 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -705,6 +705,9 @@ def save_quantized(
                 logger.warning(
                     "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.")
                 torch.save(model.state_dict(), join(save_dir, model_save_name))
+            total_size_mb = os.path.getsize(join(save_dir, model_save_name)) / (1024 * 1024)
+            total_size_gb = total_size_mb / 1024
+            logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
         else:
             # Shard checkpoint
             shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name)
@@ -725,6 +728,7 @@ def save_quantized(
                 ):
                     os.remove(full_filename)
 
+            total_size_mb = 0
             # Save the model
             for shard_file, shard in shards.items():
                 if use_safetensors:
@@ -761,6 +765,10 @@ def save_quantized(
                     safe_save(shard, join(save_dir, shard_file), safetensors_metadata)
                 else:
                     torch.save(shard, join(save_dir, shard_file))
+                shard_size_mb = os.path.getsize(join(save_dir, shard_file)) / (1024 * 1024)
+                total_size_mb += shard_size_mb
+            total_size_gb = total_size_mb / 1024
+            logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
 
             if index is not None:
                 index_save_name = model_save_name + ".index.json"

From c2f3d44ab4366785a0c3f56f5352f26c1e01a5c8 Mon Sep 17 00:00:00 2001
From: PZS-ModelCloud <pzs@modelcloud.ai>
Date: Thu, 1 Aug 2024 11:28:53 +0000
Subject: [PATCH 2/4] log pre-quantized model size and size difference

---
 gptqmodel/models/base.py | 18 +++++++++++++-----
 gptqmodel/utils/model.py | 21 +++++++++++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 657255bf5..05ae50c6d 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -31,7 +31,7 @@
                            convert_gptq_v2_to_v1_format, copy_py_files, find_layers, get_checkpoints, get_device,
                            get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules,
                            gptqmodel_post_init, make_quant, move_to, nested_move_to, pack_model,
-                           simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes)
+                           simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, get_model_files_size)
 from ..version import __version__
 from ._const import CPU, CUDA_0, DEVICE, SUPPORTED_MODELS
 
@@ -609,6 +609,9 @@ def save_quantized(
         """save quantized model and configs to local disk"""
         os.makedirs(save_dir, exist_ok=True)
 
+        pre_quantized_size_mb = get_model_files_size(self.model_name_or_path)
+        pre_quantized_size_gb = pre_quantized_size_mb / 1024
+
         # write gptqmodel tooling fingerprint to config
         self.quantize_config.meta_set_versionable(
             key=META_FIELD_QUANTIZER,
@@ -706,8 +709,6 @@ def save_quantized(
                     "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.")
                 torch.save(model.state_dict(), join(save_dir, model_save_name))
             total_size_mb = os.path.getsize(join(save_dir, model_save_name)) / (1024 * 1024)
-            total_size_gb = total_size_mb / 1024
-            logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
         else:
             # Shard checkpoint
             shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name)
@@ -767,8 +768,6 @@ def save_quantized(
                     torch.save(shard, join(save_dir, shard_file))
                 shard_size_mb = os.path.getsize(join(save_dir, shard_file)) / (1024 * 1024)
                 total_size_mb += shard_size_mb
-            total_size_gb = total_size_mb / 1024
-            logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
 
             if index is not None:
                 index_save_name = model_save_name + ".index.json"
@@ -777,6 +776,15 @@ def save_quantized(
                 with open(index_save_path, "w", encoding="utf-8") as f:
                     content = json.dumps(index, indent=2, sort_keys=True) + "\n"
                     f.write(content)
+
+        total_size_gb = total_size_mb / 1024
+        size_diff_mb = pre_quantized_size_mb - total_size_mb
+        size_diff_gb = size_diff_mb / 1024
+        percent_diff = (size_diff_mb / pre_quantized_size_mb) * 100 if pre_quantized_size_mb else 0.0  # size is 0 when no source weight files matched; avoid ZeroDivisionError mid-save
+        logger.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB")
+        logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB")
+        logger.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%")
+
         config.quantization_config = quantize_config.to_dict()
         config.save_pretrained(save_dir)
 
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 2b2957ed0..310f85049 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -663,3 +663,24 @@ def copy_py_files(save_dir, file_extension=".py", model_id_or_path=""):
             if file.rfilename.endswith(file_extension):
                 _ = hf_hub_download(repo_id=model_id_or_path, filename=file.rfilename,
                                                   local_dir=save_dir)
+
+def get_model_files_size(pre_quantized_model_path, file_extension=('.bin', '.safetensors', '.pth', '.pt', '.ckpt', '.h5', '.pb', '.onnx')):
+    """Return the combined size, in MB (bytes / 1024**2), of a model's weight files.
+
+    `pre_quantized_model_path` is a local directory (only its top-level regular files are counted); any other
+    value is passed to the Hugging Face Hub as a repo id. `file_extension` is a list/tuple of file suffixes.
+    """
+    if os.path.isdir(pre_quantized_model_path):
+        pre_quantized_size_bytes = sum(
+            os.path.getsize(os.path.join(pre_quantized_model_path, f))
+            for f in os.listdir(pre_quantized_model_path)
+            if os.path.isfile(os.path.join(pre_quantized_model_path, f)) and os.path.splitext(f)[1] in file_extension
+        )
+    else:
+        # One model_info() response lists every repo file with its size, so no per-file request is needed.
+        siblings = HfApi().model_info(pre_quantized_model_path, files_metadata=True).siblings
+        suffixes = tuple(file_extension)  # str.endswith() accepts a tuple of suffixes, not a list
+        # `size` is Optional in huggingface_hub; count a missing value as 0 instead of raising TypeError.
+        pre_quantized_size_bytes = sum(s.size or 0 for s in siblings if s.rfilename.endswith(suffixes))
+    pre_quantized_size_mb = pre_quantized_size_bytes / (1024 * 1024)
+    return pre_quantized_size_mb
\ No newline at end of file

From feda20df9f81504dbe30ce3bb1f759ccc9ce6c6c Mon Sep 17 00:00:00 2001
From: PZS-ModelCloud <pzs@modelcloud.ai>
Date: Thu, 1 Aug 2024 11:50:05 +0000
Subject: [PATCH 3/4] move config damp_percent to meta

---
 gptqmodel/models/base.py         |  4 ++--
 gptqmodel/quantization/config.py | 12 +++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 05ae50c6d..4e6eb91b7 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -19,7 +19,7 @@
 
 from ..nn_modules.qlinear.qlinear_qbits import QBitsQuantLinear, qbits_dtype
 from ..quantization import GPTQ, QuantizeConfig
-from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL,
+from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_FIELD_DAMP_PERCENT, META_QUANTIZER_GPTQMODEL,
                                    MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST, AutoRoundQuantizeConfig)
 from ..utils.backend import BACKEND
 from ..utils.data import collate_data
@@ -486,7 +486,7 @@ def tmp(_, inp, out):
 
                     try:
                         scale, zero, g_idx, duration, avg_loss, bits = gptq[name].fasterquant(
-                            percdamp=self.quantize_config.damp_percent,
+                            percdamp=self.quantize_config.meta_get(META_FIELD_DAMP_PERCENT),
                             group_size=self.quantize_config.group_size,
                             actorder=self.quantize_config.desc_act,
                             static_groups=self.quantize_config.static_groups,
diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
index 57726992d..5a0c585fd 100644
--- a/gptqmodel/quantization/config.py
+++ b/gptqmodel/quantization/config.py
@@ -28,7 +28,7 @@
 META_FIELD = "meta"
 # quantizer is the tool that did the quantization
 META_FIELD_QUANTIZER = "quantizer"
-
+META_FIELD_DAMP_PERCENT = "damp_percent"
 META_QUANTIZER_GPTQMODEL = "gptqmodel"
 
 # pkg names
@@ -83,7 +83,6 @@ class QuantizeConfig():
     # 128 offer good balance between inference speed and quantization quality
     group_size: int = field(default=128)
     # increase damp if NaN is encountred during `.quantize()` and/or increase calib dataset size
-    damp_percent: float = field(default=0.005)
     desc_act: bool = field(default=True)
     static_groups: bool = field(default=False)
     sym: bool = field(default=True)
@@ -128,9 +127,6 @@ def __post_init__(self):
         if self.group_size != -1 and self.group_size <= 0:
             raise ValueError("unless equal to -1, group_size must greater then 0.")
 
-        if not (0 < self.damp_percent < 1):
-            raise ValueError("damp_percent must between 0 and 1.")
-
         # validate meta
         if self.meta is not None:
             if not isinstance(self.meta, dict):
@@ -138,8 +134,11 @@ def __post_init__(self):
             for key, value in self.meta.items():
                 if not isinstance(key, str):
                     raise ValueError("Keys in the meta dictionary must be strings")
+                if key is META_FIELD_DAMP_PERCENT:
+                    if not (0 < value < 1):
+                        raise ValueError(f"{META_FIELD_DAMP_PERCENT} must between 0 and 1.")
         else:
-            self.meta = {}
+            self.meta = {META_FIELD_DAMP_PERCENT: 0.005}
 
     def meta_set(self, key: str, value: Any):
         self.meta[key] = value
@@ -301,7 +300,6 @@ def to_dict(self):
             "static_groups": self.static_groups,
             "sym": self.sym,
             "lm_head": self.lm_head,
-            "damp_percent": self.damp_percent,
             "true_sequential": self.true_sequential,
             # TODO: deprecate?
             "model_name_or_path": self.model_name_or_path,

From b28c24f282abee4be0564bc34f61d3d905e18a89 Mon Sep 17 00:00:00 2001
From: PZS-ModelCloud <pzs@modelcloud.ai>
Date: Fri, 2 Aug 2024 01:09:36 +0000
Subject: [PATCH 4/4] revert damp_percent mod

---
 gptqmodel/models/base.py         |  4 ++--
 gptqmodel/quantization/config.py | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index ced023d9f..01d2262a0 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -19,7 +19,7 @@
 
 from ..nn_modules.qlinear.qlinear_qbits import QBitsQuantLinear, qbits_dtype
 from ..quantization import GPTQ, QuantizeConfig
-from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_FIELD_DAMP_PERCENT, META_QUANTIZER_GPTQMODEL,
+from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL,
                                    MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST, AutoRoundQuantizeConfig)
 from ..utils.backend import BACKEND
 from ..utils.data import collate_data
@@ -485,7 +485,7 @@ def tmp(_, inp, out):
 
                     try:
                         scale, zero, g_idx, duration, avg_loss, bits = gptq[name].fasterquant(
-                            percdamp=self.quantize_config.meta_get(META_FIELD_DAMP_PERCENT),
+                            percdamp=self.quantize_config.damp_percent,
                             group_size=self.quantize_config.group_size,
                             actorder=self.quantize_config.desc_act,
                             static_groups=self.quantize_config.static_groups,
diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
index 5a0c585fd..57726992d 100644
--- a/gptqmodel/quantization/config.py
+++ b/gptqmodel/quantization/config.py
@@ -28,7 +28,7 @@
 META_FIELD = "meta"
 # quantizer is the tool that did the quantization
 META_FIELD_QUANTIZER = "quantizer"
-META_FIELD_DAMP_PERCENT = "damp_percent"
+
 META_QUANTIZER_GPTQMODEL = "gptqmodel"
 
 # pkg names
@@ -83,6 +83,7 @@ class QuantizeConfig():
     # 128 offer good balance between inference speed and quantization quality
     group_size: int = field(default=128)
     # increase damp if NaN is encountred during `.quantize()` and/or increase calib dataset size
+    damp_percent: float = field(default=0.005)
     desc_act: bool = field(default=True)
     static_groups: bool = field(default=False)
     sym: bool = field(default=True)
@@ -127,6 +128,9 @@ def __post_init__(self):
         if self.group_size != -1 and self.group_size <= 0:
             raise ValueError("unless equal to -1, group_size must greater then 0.")
 
+        if not (0 < self.damp_percent < 1):
+            raise ValueError("damp_percent must between 0 and 1.")
+
         # validate meta
         if self.meta is not None:
             if not isinstance(self.meta, dict):
@@ -134,11 +138,8 @@ def __post_init__(self):
             for key, value in self.meta.items():
                 if not isinstance(key, str):
                     raise ValueError("Keys in the meta dictionary must be strings")
-                if key is META_FIELD_DAMP_PERCENT:
-                    if not (0 < value < 1):
-                        raise ValueError(f"{META_FIELD_DAMP_PERCENT} must between 0 and 1.")
         else:
-            self.meta = {META_FIELD_DAMP_PERCENT: 0.005}
+            self.meta = {}
 
     def meta_set(self, key: str, value: Any):
         self.meta[key] = value
@@ -300,6 +301,7 @@ def to_dict(self):
             "static_groups": self.static_groups,
             "sym": self.sym,
             "lm_head": self.lm_head,
+            "damp_percent": self.damp_percent,
             "true_sequential": self.true_sequential,
             # TODO: deprecate?
             "model_name_or_path": self.model_name_or_path,