From 2b859cf81f18049f4a48e36e5c61da006860a772 Mon Sep 17 00:00:00 2001 From: PZS-ModelCloud Date: Thu, 1 Aug 2024 10:08:06 +0000 Subject: [PATCH 1/4] add save_quantized log model total size --- gptqmodel/models/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 22d0a331d..657255bf5 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -705,6 +705,9 @@ def save_quantized( logger.warning( "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.") torch.save(model.state_dict(), join(save_dir, model_save_name)) + total_size_mb = os.path.getsize(join(save_dir, model_save_name)) / (1024 * 1024) + total_size_gb = total_size_mb / 1024 + logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") else: # Shard checkpoint shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) @@ -725,6 +728,7 @@ def save_quantized( ): os.remove(full_filename) + total_size_mb = 0 # Save the model for shard_file, shard in shards.items(): if use_safetensors: @@ -761,6 +765,10 @@ def save_quantized( safe_save(shard, join(save_dir, shard_file), safetensors_metadata) else: torch.save(shard, join(save_dir, shard_file)) + shard_size_mb = os.path.getsize(join(save_dir, shard_file)) / (1024 * 1024) + total_size_mb += shard_size_mb + total_size_gb = total_size_mb / 1024 + logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") if index is not None: index_save_name = model_save_name + ".index.json" From c2f3d44ab4366785a0c3f56f5352f26c1e01a5c8 Mon Sep 17 00:00:00 2001 From: PZS-ModelCloud Date: Thu, 1 Aug 2024 11:28:53 +0000 Subject: [PATCH 2/4] add native model size log --- gptqmodel/models/base.py | 18 +++++++++++++----- gptqmodel/utils/model.py | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 657255bf5..05ae50c6d 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -31,7 +31,7 @@ convert_gptq_v2_to_v1_format, copy_py_files, find_layers, get_checkpoints, get_device, get_module_by_name_prefix, get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, move_to, nested_move_to, pack_model, - simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) + simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes, get_model_files_size) from ..version import __version__ from ._const import CPU, CUDA_0, DEVICE, SUPPORTED_MODELS @@ -609,6 +609,9 @@ def save_quantized( """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) + pre_quantized_size_mb = get_model_files_size(self.model_name_or_path) + pre_quantized_size_gb = pre_quantized_size_mb / 1024 + # write gptqmodel tooling fingerprint to config self.quantize_config.meta_set_versionable( key=META_FIELD_QUANTIZER, @@ -706,8 +709,6 @@ def save_quantized( "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.") torch.save(model.state_dict(), join(save_dir, model_save_name)) total_size_mb = os.path.getsize(join(save_dir, model_save_name)) / (1024 * 1024) - total_size_gb = total_size_mb / 1024 - logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") else: # Shard checkpoint shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) @@ -767,8 +768,6 @@ def save_quantized( torch.save(shard, join(save_dir, shard_file)) shard_size_mb = os.path.getsize(join(save_dir, shard_file)) / (1024 * 1024) total_size_mb += shard_size_mb - total_size_gb = total_size_mb / 1024 - logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") if index is not None: index_save_name = model_save_name + ".index.json" @@ -777,6 +776,15 @@ def save_quantized( with open(index_save_path, "w", encoding="utf-8") as f: content = json.dumps(index, indent=2, sort_keys=True) + "\n" f.write(content) + + total_size_gb = total_size_mb / 1024 + size_diff_mb = pre_quantized_size_mb - total_size_mb + size_diff_gb = size_diff_mb / 1024 + percent_diff = (size_diff_mb / pre_quantized_size_mb) * 100 + logger.info(f"Pre-Quantized model size: {pre_quantized_size_mb:.2f}MB, {pre_quantized_size_gb:.2f}GB") + logger.info(f"Quantized model size: {total_size_mb:.2f}MB, {total_size_gb:.2f}GB") + logger.info(f"Size difference: {size_diff_mb:.2f}MB, {size_diff_gb:.2f}GB - {percent_diff:.2f}%") + config.quantization_config = quantize_config.to_dict() config.save_pretrained(save_dir) diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 2b2957ed0..310f85049 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -663,3 +663,24 @@ def copy_py_files(save_dir, file_extension=".py", model_id_or_path=""): if file.rfilename.endswith(file_extension): _ = hf_hub_download(repo_id=model_id_or_path, filename=file.rfilename, local_dir=save_dir) + +def get_model_files_size(pre_quantized_model_path, file_extension=['.bin', '.safetensors', '.pth', '.pt', '.ckpt', '.h5', '.pb', '.onnx']): + if os.path.isdir(pre_quantized_model_path): + pre_quantized_size_bytes = sum( + os.path.getsize(os.path.join(pre_quantized_model_path, f)) + for f in os.listdir(pre_quantized_model_path) + if os.path.isfile(os.path.join(pre_quantized_model_path, f)) and os.path.splitext(f)[ + 1] in file_extension + ) + else: + api = HfApi() + files_data = api.list_repo_files(pre_quantized_model_path) + pre_quantized_size_bytes = 0 + for file_info in files_data: + if any(file_info.endswith(ext) for ext in file_extension): + file_metadata = api.model_info(pre_quantized_model_path, files_metadata=True) + for file_data in file_metadata.siblings: + if file_data.rfilename == file_info: + pre_quantized_size_bytes += file_data.size + pre_quantized_size_mb = pre_quantized_size_bytes / (1024 * 1024) + return pre_quantized_size_mb \ No newline at end of file From feda20df9f81504dbe30ce3bb1f759ccc9ce6c6c Mon Sep 17 00:00:00 2001 From: PZS-ModelCloud Date: Thu, 1 Aug 2024 11:50:05 +0000 Subject: [PATCH 3/4] move config damp_percent to meta --- gptqmodel/models/base.py | 4 ++-- gptqmodel/quantization/config.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 05ae50c6d..4e6eb91b7 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -19,7 +19,7 @@ from ..nn_modules.qlinear.qlinear_qbits import QBitsQuantLinear, qbits_dtype from ..quantization import GPTQ, QuantizeConfig -from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, +from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_FIELD_DAMP_PERCENT, META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST, AutoRoundQuantizeConfig) from ..utils.backend import BACKEND from ..utils.data import collate_data @@ -486,7 +486,7 @@ def tmp(_, inp, out): try: scale, zero, g_idx, duration, avg_loss, bits = gptq[name].fasterquant( - percdamp=self.quantize_config.damp_percent, + percdamp=self.quantize_config.meta_get(META_FIELD_DAMP_PERCENT), group_size=self.quantize_config.group_size, actorder=self.quantize_config.desc_act, static_groups=self.quantize_config.static_groups, diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 57726992d..5a0c585fd 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -28,7 +28,7 @@ META_FIELD = "meta" # quantizer is the tool that did the quantization META_FIELD_QUANTIZER = "quantizer" - +META_FIELD_DAMP_PERCENT = "damp_percent" META_QUANTIZER_GPTQMODEL = "gptqmodel" # pkg names @@ -83,7 +83,6 @@ class QuantizeConfig(): # 128 offer good balance between inference speed and quantization quality group_size: int = field(default=128) # increase damp if NaN is encountred during `.quantize()` and/or increase calib dataset size - damp_percent: float = field(default=0.005) desc_act: bool = field(default=True) static_groups: bool = field(default=False) sym: bool = field(default=True) @@ -128,9 +127,6 @@ def __post_init__(self): if self.group_size != -1 and self.group_size <= 0: raise ValueError("unless equal to -1, group_size must greater then 0.") - if not (0 < self.damp_percent < 1): - raise ValueError("damp_percent must between 0 and 1.") - # validate meta if self.meta is not None: if not isinstance(self.meta, dict): @@ -138,8 +134,11 @@ def __post_init__(self): for key, value in self.meta.items(): if not isinstance(key, str): raise ValueError("Keys in the meta dictionary must be strings") + if key is META_FIELD_DAMP_PERCENT: + if not (0 < value < 1): + raise ValueError(f"{META_FIELD_DAMP_PERCENT} must between 0 and 1.") else: - self.meta = {} + self.meta = {META_FIELD_DAMP_PERCENT: 0.005} def meta_set(self, key: str, value: Any): self.meta[key] = value @@ -301,7 +300,6 @@ def to_dict(self): "static_groups": self.static_groups, "sym": self.sym, "lm_head": self.lm_head, - "damp_percent": self.damp_percent, "true_sequential": self.true_sequential, # TODO: deprecate? "model_name_or_path": self.model_name_or_path, From b28c24f282abee4be0564bc34f61d3d905e18a89 Mon Sep 17 00:00:00 2001 From: PZS-ModelCloud Date: Fri, 2 Aug 2024 01:09:36 +0000 Subject: [PATCH 4/4] revert damp_percent mod --- gptqmodel/models/base.py | 4 ++-- gptqmodel/quantization/config.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index ced023d9f..01d2262a0 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -19,7 +19,7 @@ from ..nn_modules.qlinear.qlinear_qbits import QBitsQuantLinear, qbits_dtype from ..quantization import GPTQ, QuantizeConfig -from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_FIELD_DAMP_PERCENT, META_QUANTIZER_GPTQMODEL, +from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST, AutoRoundQuantizeConfig) from ..utils.backend import BACKEND from ..utils.data import collate_data @@ -485,7 +485,7 @@ def tmp(_, inp, out): try: scale, zero, g_idx, duration, avg_loss, bits = gptq[name].fasterquant( - percdamp=self.quantize_config.meta_get(META_FIELD_DAMP_PERCENT), + percdamp=self.quantize_config.damp_percent, group_size=self.quantize_config.group_size, actorder=self.quantize_config.desc_act, static_groups=self.quantize_config.static_groups, diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 5a0c585fd..57726992d 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -28,7 +28,7 @@ META_FIELD = "meta" # quantizer is the tool that did the quantization META_FIELD_QUANTIZER = "quantizer" -META_FIELD_DAMP_PERCENT = "damp_percent" + META_QUANTIZER_GPTQMODEL = "gptqmodel" # pkg names @@ -83,6 +83,7 @@ class QuantizeConfig(): # 128 offer good balance between inference speed and quantization quality group_size: int = field(default=128) # increase damp if NaN is encountred during `.quantize()` and/or increase calib dataset size + damp_percent: float = field(default=0.005) desc_act: bool = field(default=True) static_groups: bool = field(default=False) sym: bool = field(default=True) @@ -127,6 +128,9 @@ def __post_init__(self): if self.group_size != -1 and self.group_size <= 0: raise ValueError("unless equal to -1, group_size must greater then 0.") + if not (0 < self.damp_percent < 1): + raise ValueError("damp_percent must between 0 and 1.") + # validate meta if self.meta is not None: if not isinstance(self.meta, dict): @@ -134,11 +138,8 @@ def __post_init__(self): for key, value in self.meta.items(): if not isinstance(key, str): raise ValueError("Keys in the meta dictionary must be strings") - if key is META_FIELD_DAMP_PERCENT: - if not (0 < value < 1): - raise ValueError(f"{META_FIELD_DAMP_PERCENT} must between 0 and 1.") else: - self.meta = {META_FIELD_DAMP_PERCENT: 0.005} + self.meta = {} def meta_set(self, key: str, value: Any): self.meta[key] = value @@ -300,6 +301,7 @@ def to_dict(self): "static_groups": self.static_groups, "sym": self.sym, "lm_head": self.lm_head, + "damp_percent": self.damp_percent, "true_sequential": self.true_sequential, # TODO: deprecate? "model_name_or_path": self.model_name_or_path,