
revert prefix-less regex matching #321

Merged · 1 commit · Aug 1, 2024
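This PR reverts prefix-less regex matching for `dynamic_bits`: patterns are matched against the full module name again, including the model's `layers_node` prefix (e.g. `model.layers.18.mlp.gate_proj` instead of `18.mlp.gate_proj`), so prefix-less patterns such as `r"^18\..*gate.*"` need a leading `.*\.` to keep matching. A minimal sketch of the matching behavior after the revert; the module names and the `model.layers` prefix are illustrative, not taken from the diff:

    import re

    # Illustrative fully qualified module names; the real prefix depends on
    # the model architecture's `layers_node`.
    names = [
        "model.layers.18.mlp.gate_proj",
        "model.layers.19.self_attn.q_proj",
    ]

    dynamic_bits = {
        r".*\.18\..*gate.*": 8,  # prefixed style: matches after this revert
        r"^18\..*gate.*": 8,     # old prefix-less style: no longer matches
    }

    for name in names:
        bits = 4  # default from QuantizeConfig(bits=4)
        for pattern, d_bits in dynamic_bits.items():
            if re.match(pattern, name):  # re.match anchors at the string start
                bits = d_bits
                break
        print(name, "->", bits)
    # model.layers.18.mlp.gate_proj -> 8
    # model.layers.19.self_attn.q_proj -> 4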
6 changes: 1 addition & 5 deletions gptqmodel/models/base.py
@@ -322,7 +322,6 @@ def collate_batch(batch):
desc_act=self.quantize_config.desc_act,
force_layer_back_to_cpu=True,
format=self.quantize_config.format,
- prefix=self.layers_node,
)

self.model = model
@@ -437,7 +436,7 @@ def store_input_hook(_, args, kwargs):
for name in subset:
bits = self.quantize_config.bits
if self.quantize_config.dynamic_bits is not None:
- key = f"{i}.{name}"
+ key = f"{self.layers_node}.{i}.{name}"
for pattern, d_bits in self.quantize_config.dynamic_bits.items():
if re.match(pattern, key):
bits = d_bits
@@ -561,7 +560,6 @@ def tmp(_, inp, out):
force_layer_back_to_cpu=force_layer_back_to_cpu,
format=self.quantize_config.format,
dynamic_bits=self.quantize_config.dynamic_bits,
- prefix=self.layers_node,
)

if device_map:
@@ -829,7 +827,6 @@ def skip(*args, **kwargs):
format=quantize_config.format,
desc_act=quantize_config.desc_act,
pack=True,
- prefix=self.layers_node,
)
model.tie_weights()

@@ -1183,7 +1180,6 @@ def skip(*args, **kwargs):
format=quantize_config.format,
desc_act=quantize_config.desc_act,
dynamic_bits=quantize_config.dynamic_bits,
- prefix=cls.layers_node,
)
if preload_qlinear_kernel == QBitsQuantLinear:
quantize_config.runtime_format = FORMAT.QBITS
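With the prefix restored, the key built during quantization (`f"{self.layers_node}.{i}.{name}"`) and the name seen by `make_quant` at load time have the same prefixed shape, so a single `dynamic_bits` pattern applies to both paths. A small sketch of the key construction; the `layers_node` value, layer index, and submodule name below are illustrative:

    import re

    layers_node = "model.layers"   # illustrative; set by the model class
    dynamic_bits = {r".*\.18\..*gate.*": 8}

    i = 18                          # decoder layer index
    name = "mlp.gate_proj"          # submodule within the layer

    key = f"{layers_node}.{i}.{name}"  # "model.layers.18.mlp.gate_proj"
    bits = 4  # default
    for pattern, d_bits in dynamic_bits.items():
        if re.match(pattern, key):
            bits = d_bits
            break
    print(key, "->", bits)  # model.layers.18.mlp.gate_proj -> 8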
15 changes: 7 additions & 8 deletions gptqmodel/utils/model.py
@@ -118,7 +118,6 @@ def make_quant(
sym: bool = True,
pack: bool = False,
dynamic_bits: Optional[Dict[str, int]] = None,
- prefix: str = None,
) -> BaseQuantLinear:
select_quant_linear_func = select_quant_linear_with_pack if pack else select_quant_linear
QuantLinear = select_quant_linear_func(
@@ -152,15 +151,17 @@
raise NotImplementedError(f"Unsupported module {submodule}")

bias = submodule.bias is not None
- d_bits = bits
+
+ # bits may differ for each layer/module
if dynamic_bits is not None:
- match_name = name.removeprefix(f"{prefix}.")
+ # check if any dynamic bits regex matches module `name`
for pattern, dm_bits in dynamic_bits.items():
- if re.match(pattern, match_name):
- d_bits = dm_bits
+ if re.match(pattern, name):
+ bits = dm_bits
+ break

new_layer = QuantLinear(
- bits=d_bits,
+ bits=bits,
group_size=group_size,
desc_act=desc_act,
sym=sym,
@@ -270,7 +271,6 @@ def pack_model(
sym: bool = True,
force_layer_back_to_cpu: bool = False,
dynamic_bits=None,
- prefix: str = None,
):
QuantLinear = select_quant_linear_with_pack(
bits=bits,
@@ -299,7 +299,6 @@
desc_act=desc_act,
pack=True,
dynamic_bits=dynamic_bits,
- prefix=prefix,
)
qlayers = find_layers(model, [QuantLinear])

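Note that `make_quant` uses `re.match`, which anchors at the beginning of the string; with prefix stripping gone, a pattern anchored like `^18\.` can never match a prefixed name, which is why the updated test patterns below open with `.*\.`. A quick demonstration with an illustrative module name:

    import re

    name = "model.layers.18.mlp.gate_proj"  # illustrative prefixed name

    print(bool(re.match(r"^18\..*gate.*", name)))     # False: anchored at start
    print(bool(re.match(r".*\.18\..*gate.*", name)))  # True: .*\. spans the prefix
    print(bool(re.search(r"\.18\..*gate", name)))     # True: re.search scans anywhere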
24 changes: 11 additions & 13 deletions tests/test_quant_batch.py
@@ -48,7 +48,6 @@ def test_diff_batch(self):
quantize_config = QuantizeConfig(
bits=4,
group_size=128,
- format=FORMAT.GPTQ,
)

model = GPTQModel.from_pretrained(
@@ -67,7 +66,6 @@

model = GPTQModel.from_quantized(
tmp_dir,
- device_map="auto",
)

batch_size_1_ppl = self.calculate_avg_ppl(model, self.tokenizer)
@@ -77,8 +75,7 @@
quantize_config=quantize_config,
)

- model.quantize(self.calibration_dataset, batch_size=256)
-
+ model.quantize(self.calibration_dataset, batch_size=4)
with tempfile.TemporaryDirectory() as tmp_dir:
model.save_quantized(
tmp_dir,
@@ -88,32 +85,32 @@

model = GPTQModel.from_quantized(
tmp_dir,
- device_map="auto",
)

batch_size_256_ppl = self.calculate_avg_ppl(model, self.tokenizer)

del model

assert abs(batch_size_1_ppl - batch_size_256_ppl) < 0.1

def test_dynamic_bits(self):
- # layer starting point of 0
dynamic_bits = {
- r"^18\..*gate.*": 8,
- r"^19\..*gate.*": 8,
- r"^20\..*gate.*": 8,
- r"^21\..*gate.*": 8,
+ # `.*\.` matches the layers_node prefix
+ r".*\.18\..*gate.*": 8, # match layer 18 (index starts at 0) gate module
+ r".*\.19\..*gate.*": 8, # match layer 19 (index starts at 0) gate module
+ r".*\.20\..*gate.*": 8, # match layer 20 (index starts at 0) gate module
+ r".*\.21\..*gate.*": 8, # match layer 21 (index starts at 0) gate module
}
quantize_config = QuantizeConfig(
bits=4,
dynamic_bits=dynamic_bits,
group_size=128,
- format=FORMAT.GPTQ,
)
model = GPTQModel.from_pretrained(
self.NATIVE_MODEL_ID,
quantize_config=quantize_config,
)
- model.quantize(self.calibration_dataset, batch_size=256)
+ model.quantize(self.calibration_dataset, batch_size=4)

with tempfile.TemporaryDirectory() as tmp_dir:
model.save_quantized(
@@ -124,9 +121,10 @@

model = GPTQModel.from_quantized(
tmp_dir,
- device_map="auto",
+ backend=BACKEND.TRITON,
)

dynamic_bits_ppl = self.calculate_avg_ppl(model, self.tokenizer)

del model
assert dynamic_bits_ppl < 10
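When writing `dynamic_bits` rules against prefixed names, it can help to preview which modules a pattern will capture before running a full quantization. The helper below is hypothetical, not part of GPTQModel's API; module names would typically come from `model.named_modules()`:

    import re
    from typing import Dict, Iterable, List, Tuple

    def preview_matches(module_names: Iterable[str],
                        dynamic_bits: Dict[str, int]) -> List[Tuple[str, int]]:
        # Hypothetical helper: report (name, bits) for each module that a
        # dynamic_bits pattern would capture; the first matching pattern wins.
        hits = []
        for name in module_names:
            for pattern, bits in dynamic_bits.items():
                if re.match(pattern, name):
                    hits.append((name, bits))
                    break
        return hits

    names = [f"model.layers.{i}.mlp.gate_proj" for i in range(18, 22)]
    print(preview_matches(names, {r".*\.1[89]\..*gate.*": 8}))
    # [('model.layers.18.mlp.gate_proj', 8), ('model.layers.19.mlp.gate_proj', 8)]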