diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py
index a5e123e62..80b6c241e 100644
--- a/bitsandbytes/backends/cpu.py
+++ b/bitsandbytes/backends/cpu.py
@@ -136,6 +136,7 @@ def quantize_4bit(
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
         assert_on_cpu([A, absmax, out])
+        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"
         return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type)
 
     def dequantize_4bit(
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 078b81680..ab881c6dd 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -343,6 +343,8 @@ def quantize_4bit_impl(
     )
 
     if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0:
+        # lowp_mode: lowest precision for computation
+        lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16
         state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack(
             out.reshape([input_shape[0], input_shape[1] // 2]),
             ipex_cpu.quantization.WoqWeightDtype.NF4,
@@ -353,8 +355,8 @@ def quantize_4bit_impl(
             None,  # g_idx
             None,  # batch_size
             blocksize,
-            int(ipex_cpu.quantization.WoqLowpMode.BF16),
-            -1,  # act_quant_mode
+            int(lowp_mode),
+            -1,  # act_quant_mode. -1 means don't quant activation
         )
 
     return out, state
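
Note (not part of the patch): a minimal sketch of what the new guard in cpu.py enforces, assuming only that torch is installed. quantize_4bit_cpu_guard is a hypothetical stand-in for the backend's quantize_4bit entry point, shown only to illustrate the assert's effect.

    import torch

    def quantize_4bit_cpu_guard(quant_storage=torch.uint8):
        # Mirrors the added assert: the CPU path packs two 4-bit values per byte
        # (see the out.reshape([..., input_shape[1] // 2]) above), so only
        # torch.uint8 storage is accepted; anything else fails fast.
        assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage"

    quantize_4bit_cpu_guard(torch.uint8)        # accepted
    try:
        quantize_4bit_cpu_guard(torch.float16)  # rejected with a clear message
    except AssertionError as err:
        print(err)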