Expose hqq through int4_weight_only API
Summary:
As titled, this is a follow-up to pytorch#605 that makes HQQ available through the `quantize_` API:

`quantize_(model, int4_weight_only(group_size, use_hqq=True))`
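
A minimal end-to-end sketch of the new flag (the `torchao.quantization` import path and the CUDA/bfloat16 setup are assumptions, not verified here):

```python
import torch
from torchao.quantization import quantize_, int4_weight_only  # import path assumed

# Toy model; int4 weight-only quantization targets nn.Linear layers, and the
# tinygemm path expects bfloat16 weights on CUDA.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# group_size=64 matches the int4wo-hqq-64 configuration in the Test Plan below;
# use_hqq=True is the flag added by this commit.
quantize_(model, int4_weight_only(group_size=64, use_hqq=True))
```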

Test Plan:

python generate.py --compile --quantization int4wo-hqq-64 --precision bfloat16
Average tokens/sec: 195.24
Average Bandwidth: 729.40 GB/s
Peak Memory Usage: 5.09 GB
Model Size: 3.74 GB

python eval.py --compile --quantization int4wo-hqq-64 --precision bfloat16

wikitext: {'word_perplexity,none': 12.823631773497512, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.611400903914048, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6883154699192412, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}

Reviewers:

Subscribers:

Tasks:

Tags:
jerryzh168 committed Sep 4, 2024
1 parent 1a31ebd commit f82af17
4 changes: 2 additions & 2 deletions torchao/quantization/quant_api.py
@@ -419,7 +419,7 @@ def int8_dynamic_activation_int4_weight(group_size=32):
return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size)


-def int4_weight_only(group_size=128, layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8)):
+def int4_weight_only(group_size=128, layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8), use_hqq=False):
"""
Applies uint4 weight-only asymmetric per-group quantization to linear layers, using
"tensor_core_tiled" layout for speedup with tinygemm kernel
@@ -453,7 +453,7 @@ def apply_int4_weight_only_quant(weight):
preserve_zero = False
zero_point_dtype = torch.bfloat16
zero_point_domain = ZeroPointDomain.FLOAT
-    return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain, layout_type=layout_type)
+    return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain, layout_type=layout_type, use_hqq=use_hqq)

return _get_linear_subclass_inserter(apply_int4_weight_only_quant)

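For reference, a hedged sketch of calling the full updated signature with an explicit layout (the `TensorCoreTiledLayoutType` import path is an assumption):

```python
import torch
from torchao.dtypes import TensorCoreTiledLayoutType  # import path assumed
from torchao.quantization import quantize_, int4_weight_only

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).cuda()

# All three parameters from the new signature above: the defaults for
# group_size and layout_type, plus the use_hqq flag added by this commit.
quantize_(
    model,
    int4_weight_only(
        group_size=128,
        layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8),
        use_hqq=True,
    ),
)
```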
