diff --git a/README.md b/README.md index aa88cf84f99..1b0a520469c 100644 --- a/README.md +++ b/README.md @@ -219,11 +219,11 @@ outputs = model.generate(inputs) You can also load the low-bit model quantized by GPTQ/AWQ/RTN/AutoRound algorithm. ```python from transformers import AutoTokenizer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, GPTQConfig # Download Hugging Face GPTQ/AWQ model or use local quantize model model_name = "PATH_TO_MODEL" # local path to model -woq_config = WeightOnlyQuantConfig(use_gptq=True) # use_awq=True for AWQ; use_autoround=True for AutoRound +woq_config = GPTQConfig(bits=4) # use AwqConfig for AWQ models, and AutoRoundConfig for AutoRound models prompt = "Once upon a time, a little girl" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) diff --git a/docs/weightonlyquant.md b/docs/weightonlyquant.md index fad79d9095c..467dac491f8 100644 --- a/docs/weightonlyquant.md +++ b/docs/weightonlyquant.md @@ -3,7 +3,7 @@ Weight Only Quantization (WOQ) 1. [Introduction](#introduction) -2. [Supported Framework Model Matrix](#supported-framework-model-matrix) +2. [Supported Algorithms](#supported-algorithms) 3. [Examples For CPU/CUDA](#examples-for-cpu-and-cuda) @@ -12,40 +12,65 @@ Weight Only Quantization (WOQ) ## Introduction As large language models (LLMs) become more prevalent, there is a growing need for new and improved quantization methods that can meet the computational demands of these modern architectures while maintaining the accuracy. Compared to [normal quantization](https://github.com/intel/intel-extension-for-transformers/blob/main/docs/quantization.md) like W8A8, weight only quantization is probably a better trade-off to balance the performance and the accuracy, since we will see below that the bottleneck of deploying LLMs is the memory bandwidth and normally weight only quantization could lead to better accuracy. -## Supported Framework Model Matrix +## Supported Algorithms -| Algorithms/Framework | PyTorch | LLM Runtime | -|:--------------:|:----------:|:----------:| -| RTN | ✔ | ✔ | -| AWQ | ✔ | stay tuned | -| TEQ | ✔ | stay tuned | -| GPTQ | ✔ | ✔ | +| Support Device | RTN | AWQ | TEQ | GPTQ | AutoRound | +|:--------------:|:----------:|:----------:|:----------:|:----:|:----:| +| Intel CPU | ✔ | ✔ | ✔ | ✔ | ✔ | +| Intel GPU | ✔ | stay tuned | stay tuned | stay tuned | stay tuned | -| Support Device | RTN | AWQ | TEQ | GPTQ | -|:--------------:|:----------:|:----------:|:----------:|:----:| -| CPU | ✔ | ✔ | ✔ | ✔ | -| GPU | ✔ | stay tuned | stay tuned | stay tuned | -> **RTN:** A quantification method that we can think of very intuitively. It does not require additional datasets and is a very fast quantization method. Generally speaking, RTN will convert the weight into a uniformly distributed integer data type, but some algorithms, such as Qlora, propose a non-uniform NF4 data type and prove its theoretical optimality. +**RTN**[[1\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#1)(★★★): Rounding to Nearest (RTN) is an intuitively simple method that rounds values to the nearest integer. It boasts simplicity, requiring no additional datasets, and offers fast quantization. Besides, it can easily be applied to other data types such as NF4 (non-uniform). Typically, it performs well on configurations such as W4G32 or W8, but worse than the more advanced algorithms at lower precision levels.
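To make the rounding step concrete, here is a minimal group-wise symmetric round-to-nearest sketch (the W4G32 setting mentioned above) in plain PyTorch. It is illustrative only and is not the optimized kernel used by this library; the helper name and the small clamp on the scale are our own additions.

```python
# Toy group-wise symmetric RTN quantization (illustrative, not library code).
import torch

def rtn_quantize(weight: torch.Tensor, bits: int = 4, group_size: int = 32):
    """Round a 2-D weight tensor to `bits`-bit integers, one scale per group."""
    out_features, in_features = weight.shape
    assert in_features % group_size == 0
    qmax = 2 ** (bits - 1) - 1                            # 7 for symmetric int4
    w = weight.reshape(out_features, in_features // group_size, group_size)
    scale = (w.abs().amax(dim=-1, keepdim=True) / qmax).clamp(min=1e-8)
    qweight = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
    dequant = (qweight * scale).reshape(out_features, in_features)
    return qweight.to(torch.int8), scale, dequant

w = torch.randn(8, 64)
qw, scales, w_hat = rtn_quantize(w, bits=4, group_size=32)
print("max rounding error:", (w - w_hat).abs().max().item())
```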
-> **GPTQ:** A new one-shot weight quantization method based on approximate second-order information, that is both highly-accurate and highly efficient. The weights of each column are updated based on the fixed-scale pseudo-quantization error and the inverse of the Hessian matrix calculated from the activations. The updated columns sharing the same scale may generate a new max/min value, so the scale needs to be saved for restoration. -> **AWQ:** Proved that protecting only 1% of salient weights can greatly reduce quantization error. the salient weight channels are selected by observing the distribution of activation and weight per channel. The salient weights are also quantized after multiplying a big scale factor before quantization for preserving. +**TEQ**[[2\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#2)(★★★): To our knowledge, it is the first trainable equivalent transformation method (submitted for peer review in June 2023). However, it requires more memory than other methods because a model-wise loss is used, and the equivalent transformation imposes certain requirements on the model architecture. -> **TEQ:** A trainable equivalent transformation that preserves the FP32 precision in weight-only quantization. It is inspired by AWQ while providing a new solution to search for the optimal per-channel scaling factor between activations and weights. +**GPTQ**[[3\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#3)(★★★★): GPTQ is a widely adopted method based on the Optimal Brain Surgeon algorithm. It quantizes weights block by block and fine-tunes the remaining unquantized weights to mitigate quantization errors. Occasionally, non-positive semidefinite matrices may occur, necessitating adjustments to the hyperparameters. + + + +**AWQ**[[4\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#4)(★★★★): AWQ is a popular method that explores weight min-max values and equivalent transformations in a handcrafted search space. While effective, the equivalent transformation imposes certain requirements on model architecture, limiting its applicability to broader models or increasing the engineering effort. + + + +**AutoRound**[[5\]](https://github.com/intel/intel-extension-for-transformers/blob/548c13ed2e19cde91729530ca26c3b875c1b3d10/docs/weightonlyquant.md#5)(★★★★☆): AutoRound utilizes sign gradient descent to optimize the rounding values and min-max values of weights within just 200 steps, showcasing impressive performance compared to recent methods like GPTQ/AWQ. Additionally, it supports hyperparameter tuning to further enhance performance. However, due to its reliance on gradient backpropagation, it is currently not well suited to backends like ONNX. + +### References +[1] +Gunho Park, Baeseong Park, Se Jung Kwon, Byeongwook Kim, Youngjoo Lee, and Dongsoo Lee. +nuQmm: Quantized matmul for efficient inference of large-scale generative language models. +arXiv preprint arXiv:2206.09557, 2022. + +[2] +Cheng, W., Cai, Y., Lv, K., & Shen, H. (2023). +TEQ: Trainable Equivalent Transformation for Quantization of LLMs. +arXiv preprint arXiv:2310.10944. + +[3] +Frantar, Elias, et al. "GPTQ: Accurate post-training quantization for generative pre-trained transformers."
arXiv preprint arXiv:2210.17323 (2022). + +[4] +Lin, Ji, et al. (2023). +AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration. +arXiv preprint arXiv:2306.00978. + +[5] +Cheng, W., Zhang, W., Shen, H., Cai, Y., He, X., & Lv, K. (2023). +Optimize weight rounding via signed gradient descent for the quantization of LLMs. +arXiv preprint arXiv:2309.05516. ## Examples For CPU AND CUDA -Our motivation is improve CPU support for weight only quantization, since `bitsandbytes` only support CUDA GPU device. We have extended the `from_pretrained` function so that `quantization_config` can accept [`WeightOnlyQuantConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/quantization_config.py#L28) to implement conversion on the CPU. We not only support PyTorch but also provide LLM Runtime backend based cpp programming language. Here are the example codes. +Our motivation is to improve CPU support for weight only quantization, since `bitsandbytes`, `auto-gptq`, and `autoawq` only support the CUDA GPU device. We have extended the `from_pretrained` function so that `quantization_config` can accept [`RtnConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/config.py#L608), [`AwqConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/config.py#L793), [`TeqConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/config.py#L28), [`GPTQConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/config.py#L855), [`AutoRoundConfig`](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/utils/config.py#L912) to implement the conversion on the CPU. We support not only PyTorch but also an LLM Runtime backend implemented in C++. Here are the example codes. ### Example for CPU device -4-bit/8-bit inference with `WeightOnlyQuantConfig` on CPU device. +4-bit/8-bit inference with `RtnConfig`, `AwqConfig`, `TeqConfig`, `GPTQConfig`, or `AutoRoundConfig` on a CPU device. ```bash -cd intel_extension_for_transformers/llm/runtime/graph -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +cd examples/huggingface/pytorch/text-generation/quantization +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig model_name_or_path = "Intel/neural-chat-7b-v3-3" # weight_dtype: int8/int4, compute_dtype: int8/fp32 -woq_config = WeightOnlyQuantConfig(weight_dtype="int4", compute_dtype="int8") +woq_config = RtnConfig(bits=4, compute_dtype="int8") model = AutoModelForCausalLM.from_pretrained( model_name_or_path, quantization_config=woq_config, @@ -82,7 +107,7 @@ gen_ids = woq_model.generate(input_ids, max_new_tokens=32, **generate_kwargs) gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) print(gen_text) ```
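The calibration-based algorithms follow the same `from_pretrained` flow; only the config object changes. Below is a sketch using `GPTQConfig` on the same model as above. The parameter names mirror the example scripts updated in this PR, while the concrete values (calibration dataset, group size, dtypes) are illustrative assumptions rather than recommended settings.

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, GPTQConfig

model_name_or_path = "Intel/neural-chat-7b-v3-3"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# Unlike RTN, GPTQ needs a tokenizer and a calibration dataset.
woq_config = GPTQConfig(
    tokenizer=tokenizer,
    dataset="NeelNanda/pile-10k",   # default calibration dataset used in this repo
    bits=4,
    group_size=128,
    blocksize=128,
    compute_dtype="int8",
    weight_dtype="int4_clip",
)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=woq_config,
)
```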
-`load_in_4bit` and `load_in_8bit` both support on CPU and CUDA GPU device. If device set to use GPU, the BitsAndBytesConfig will be used, if the device set to use CPU, the WeightOnlyQuantConfig will be used. +`load_in_4bit` and `load_in_8bit` are both supported on CPU and CUDA GPU devices. If the device is set to GPU, `BitsAndBytesConfig` will be used; if the device is set to CPU, `RtnConfig` will be used. ```bash from intel_extension_for_transformers.transformers import AutoModelForCausalLM woq_model = AutoModelForCausalLM.from_pretrained( @@ -160,7 +185,6 @@ pip install intel-extension-for-transformers import intel_extension_for_pytorch as ipex from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM from transformers import AutoTokenizer -import torch device = "xpu" model_name = "Qwen/Qwen-7B" @@ -171,7 +195,7 @@ inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device) qmodel = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, device_map="xpu", trust_remote_code=True) # optimize the model with ipex, it will improve performance. -qmodel = ipex.optimize_transformers(qmodel, inplace=True, dtype=torch.float16, quantization_config={}, device="xpu") +qmodel = ipex.optimize_transformers(qmodel, inplace=True, dtype=torch.float16, woq=True, device="xpu") output = user_model.generate(inputs) @@ -195,7 +219,7 @@ model.save_pretrained("saved_dir") loaded_model = AutoModelForCausalLM.from_pretrained("saved_dir", trust_remote_code=True) # Before executed the loaded model, you can call ipex.optimize_transformers function. -loaded_model = ipex.optimize_transformers(loaded_model, inplace=True, dtype=torch.float16, woq=True, device="xpu") +loaded_model = ipex.optimize_transformers(loaded_model, inplace=True, dtype=torch.float16, quantization_config={}, device="xpu") output = loaded_model.generate(inputs) diff --git a/examples/huggingface/neural_speed/perplexity/perplexity.py b/examples/huggingface/neural_speed/perplexity/perplexity.py index 4975029c4da..698f30ced48 100644 --- a/examples/huggingface/neural_speed/perplexity/perplexity.py +++ b/examples/huggingface/neural_speed/perplexity/perplexity.py @@ -56,7 +56,7 @@ def get_ppl(sum_nll, sum_nll2, cnt: int): def perplexity(model_name, dataset_name, **kwargs): import datasets - from intel_extension_for_transformers.transformers import (AutoModelForCausalLM, WeightOnlyQuantConfig) + from intel_extension_for_transformers.transformers import (AutoModelForCausalLM, RtnConfig) from transformers import AutoTokenizer, AutoConfig model_name = try_resolve_dir(model_name) dataset_name = try_resolve_dir(dataset_name) @@ -107,7 +107,7 @@ def perplexity(model_name, dataset_name, **kwargs): for k in kwargs if k in ['use_cache', 'compute_dtype', 'weight_dtype', 'scale_dtype', 'group_size', 'use_ggml'] } - woq_config = WeightOnlyQuantConfig(**woq_kwargs) + woq_config = RtnConfig(**woq_kwargs) model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True) model_kwargs = {k: kwargs[k] for k in kwargs if k in ['n_keep', 'shift_roped_k', 'memory_dtype']} diff --git a/examples/huggingface/neural_speed/run_inference.py b/examples/huggingface/neural_speed/run_inference.py index 969b3006608..f8dd4563c6e 100644 --- a/examples/huggingface/neural_speed/run_inference.py +++ b/examples/huggingface/neural_speed/run_inference.py @@ -16,7 +16,7 @@ from pathlib import Path from typing import List, Optional from transformers import AutoTokenizer,TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a
PyTorch model to a NE compatible file") parser.add_argument("--model_path",type=Path, @@ -32,7 +32,7 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--max_new_tokens", type=int, help="max_new_tokens", default=300) args = parser.parse_args(args_in) model_name = args.model_path - woq_config = WeightOnlyQuantConfig(load_in_4bit=True, use_quant=args.not_quant, + woq_config = RtnConfig(load_in_4bit=True, use_quant=args.not_quant, weight_dtype=args.weight_dtype, compute_dtype=args.compute_dtype, group_size=args.group_size, use_gptq=args.use_gptq) prompt = args.prompt tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) diff --git a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py index ad2304f2c83..543c4f04c9a 100644 --- a/examples/huggingface/pytorch/code-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/code-generation/quantization/run_generation.py @@ -13,9 +13,13 @@ from optimum.utils import NormalizedConfigManager from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, ) from intel_extension_for_transformers.transformers import ( AutoModelForCausalLM, @@ -27,7 +31,7 @@ parser.add_argument( "--model", nargs="?", default="bigcode/starcoderbase", const="bigcode/starcoderbase" ) -parser.add_argument("--trust_remote_code", default=False) +parser.add_argument("--trust_remote_code", action="store_true") parser.add_argument("--_commit_hash", default=None, type=str) parser.add_argument("--dataset", nargs="?", default="mbpp", const="mbpp") parser.add_argument("--dtype", type=str, default="int8") @@ -60,27 +64,24 @@ # ============SmoothQuant configs============== parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.") -# ============WeightOnlyQuant configs============ -parser.add_argument("--bitsandbytes", action="store_true") -parser.add_argument("--load_in_4bit", action="store_true") -parser.add_argument("--load_in_8bit", action="store_true") +# ============WeightOnlyQuant configs=============== parser.add_argument("--woq", action="store_true") parser.add_argument( "--woq_algo", default="RTN", choices=["RTN", "AWQ", "TEQ", "GPTQ", "AUTOROUND"], - help="Weight-only parameter.", + help="Weight-only algorithm.", ) parser.add_argument( - "--woq_compute_dtype", - type=str, - default="fp32", - choices=["fp32", "bf16", "int8"], + "--bits", + type=int, + default=8, + choices=[4, 8], ) parser.add_argument( - "--woq_weight_dtype", + "--weight_dtype", type=str, - default="int4_clip", + default="int8", choices=[ "int8", "int4_clip", @@ -93,57 +94,68 @@ ], ) parser.add_argument( - "--woq_scale_dtype", + "--scale_dtype", type=str, default="fp32", choices=["fp32", "fp8"], ) -parser.add_argument("--woq_group_size", type=int, default=32) -parser.add_argument("--woq_scheme", default="sym") +parser.add_argument( + "--compute_dtype", + type=str, + default="fp32", + choices=["fp32", "bf16", "int8"], +) +parser.add_argument("--group_size", type=int, default=32) +parser.add_argument("--scheme", default="sym") # ============GPTQ configs============== parser.add_argument( - "--gptq_actorder", + "--desc_act", action="store_true", help="Whether to apply the activation order GPTQ heuristic.", ) 
parser.add_argument( - "--gptq_percdamp", + "--damp_percent", type=float, default=0.01, help="Percent of the average Hessian diagonal to use for dampening.", ) parser.add_argument( - "--gptq_block_size", + "--blocksize", type=int, default=128, help="Block size. sub weight matrix size to run GPTQ.", ) parser.add_argument( - "--gptq_nsamples", type=int, default=128, help="Number of calibration data samples." -) -parser.add_argument( - "--gptq_use_max_length", - action="store_true", - help="Set all sequence length to be same length of args.gptq_pad_max_length", + "--nsamples", type=int, default=128, help="Number of calibration data samples." ) parser.add_argument( - "--gptq_pad_max_length", + "--max_input_length", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", ) -parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization') +parser.add_argument( + "--static_groups", + action="store_true", + help="Use determined group to do quantization", +) # ============AUTOROUND configs============== parser.add_argument( - "--autoround_nsamples", - type=int, default=512, - help="Number of calibration data samples.", + "--lr", + type=float, + default=0.0025, + help="learning rate, if None, it will be set to 1.0/iters automatically", ) parser.add_argument( - "--autoround_seq_len", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", + "--minmax_lr", + type=float, + default=0.0025, + help="minmax learning rate, if None,it will beset to be the same with lr", +) +parser.add_argument( + "--use_quant_input", + action="store_true", + help="whether to use the output of quantized block to tune the next block", ) # ============Harness configs============ parser.add_argument("--tasks", default=None, help="Evaluation tasks") @@ -272,58 +284,79 @@ calib_iters=args.calib_iters, ) elif args.woq: - if args.woq_algo == "GPTQ": - algorithm_args = { - "act_order": args.gptq_actorder, - "percdamp": args.gptq_percdamp, - "block_size": args.gptq_block_size, - "nsamples": args.gptq_nsamples, - "use_max_length": args.gptq_use_max_length, - "pad_max_length": args.gptq_pad_max_length, - "static_groups": args.gptq_static_groups, - } - quantization_config = WeightOnlyQuantConfig( - compute_dtype=args.woq_compute_dtype, - scale_dtype=args.woq_scale_dtype, - weight_dtype=args.woq_weight_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, + if args.woq_algo == "RTN": + quantization_config = RtnConfig( tokenizer=tokenizer, - algorithm_args=algorithm_args, - calib_dataset=args.dataset + bits=args.bits, + scheme=args.scheme, + group_size=args.group_size, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + ) + elif args.woq_algo == "AWQ": + quantization_config = AwqConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + zero_point=False if args.scheme == "sym" else True, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + ) + elif args.woq_algo == "TEQ": + quantization_config = TeqConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + scheme=args.scheme, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + 
scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + ) + elif args.woq_algo == "GPTQ": + quantization_config = GPTQConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + desc_act=args.desc_act, + damp_percent=args.damp_percent, + sym=True if args.scheme == "sym" else False, + blocksize=args.blocksize, + nsamples=args.nsamples, + static_groups=args.static_groups, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, ) elif args.woq_algo == "AUTOROUND": - algorithm_args = { - "n_samples": args.autoround_nsamples, - "amp": False, - "seq_len": args.autoround_seq_len, - "iters": args.calib_iters, - "scale_dtype": "fp32", - "device": "cpu", - "export_args": {"format": "itrex", "inplace": False} - } - quantization_config = WeightOnlyQuantConfig( - compute_dtype=args.woq_compute_dtype, - scale_dtype=args.woq_scale_dtype, - weight_dtype=args.woq_weight_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, + quantization_config = AutoRoundConfig( tokenizer=tokenizer, - algorithm_args=algorithm_args, - calib_dataset=args.dataset + dataset=args.dataset, + bits=args.bits, + sym=True if args.scheme == "sym" else False, + nsamples=args.nsamples, + group_size=args.group_size, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + calib_len=args.calib_len, + lr=args.lr, + minmax_lr=args.minmax_lr, + use_quant_input=args.use_quant_input, ) else: - quantization_config = WeightOnlyQuantConfig( - weight_dtype=args.woq_weight_dtype, - scale_dtype=args.woq_scale_dtype, - group_size=args.woq_group_size, - scheme=args.woq_scheme, - algorithm=args.woq_algo, - tokenizer=tokenizer, - calib_dataset=args.dataset - ) # default is A32W4G32 + assert False, "Please set the correct '--woq_algo'" # bitsandbytes elif args.bitsandbytes: # GPU device is need for `load_in_4bit` and `load_in_8bit`. 
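The per-algorithm branches above all follow one pattern: `--woq_algo` selects a config class and the matching CLI flags are forwarded to it. For review purposes the mapping can be condensed into a helper like the sketch below; this is a reviewer's summary, not code from this PR, with class and argument names taken from the script above.

```python
# Condensed sketch of the --woq_algo dispatch introduced above (not library code).
from intel_extension_for_transformers.transformers import (
    RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig,
)

def build_woq_config(args, tokenizer):
    common = dict(
        tokenizer=tokenizer,
        bits=args.bits,
        group_size=args.group_size,
        compute_dtype=args.compute_dtype,
        scale_dtype=args.scale_dtype,
        weight_dtype=args.weight_dtype,
    )
    if args.woq_algo == "RTN":
        # RTN needs no calibration data.
        return RtnConfig(scheme=args.scheme, **common)
    calib = dict(dataset=args.dataset, calib_iters=args.calib_iters)
    if args.woq_algo == "AWQ":
        return AwqConfig(zero_point=args.scheme != "sym",
                         max_input_length=args.max_input_length, **calib, **common)
    if args.woq_algo == "TEQ":
        return TeqConfig(scheme=args.scheme,
                         max_input_length=args.max_input_length, **calib, **common)
    if args.woq_algo == "GPTQ":
        return GPTQConfig(sym=args.scheme == "sym", desc_act=args.desc_act,
                          damp_percent=args.damp_percent, blocksize=args.blocksize,
                          nsamples=args.nsamples, static_groups=args.static_groups,
                          max_input_length=args.max_input_length, **calib, **common)
    if args.woq_algo == "AUTOROUND":
        return AutoRoundConfig(sym=args.scheme == "sym", nsamples=args.nsamples,
                               calib_len=args.calib_len, lr=args.lr,
                               minmax_lr=args.minmax_lr,
                               use_quant_input=args.use_quant_input, **calib, **common)
    raise ValueError(f"unsupported --woq_algo: {args.woq_algo}")
```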
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh index c2c7d2842fd..15e7b640b12 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh @@ -113,32 +113,39 @@ function run_benchmark { model_name_or_path="mosaicml/mpt-7b-chat" elif [ "${topology}" = "chatglm3_6b" ]; then model_name_or_path="THUDM/chatglm3-6b" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "chatglm2_6b" ]; then model_name_or_path="THUDM/chatglm2-6b" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "chatglm_6b" ]; then model_name_or_path="THUDM/chatglm-6b" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" pip install transformers==4.33 elif [ "${topology}" = "falcon_7b" ]; then model_name_or_path="tiiuae/falcon-7b-instruct" pip install transformers==4.33 elif [ "${topology}" = "baichuan_7b" ]; then model_name_or_path="baichuan-inc/Baichuan-7B" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "baichuan_13b" ]; then model_name_or_path="baichuan-inc/Baichuan-13B-Base" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00" + pip install transformers==4.33 elif [ "${topology}" = "baichuan2_7b" ]; then model_name_or_path="baichuan-inc/Baichuan2-7B-Base" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "baichuan2_13b" ]; then model_name_or_path="baichuan-inc/Baichuan2-13B-Base" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "qwen_7b" ]; then model_name_or_path="Qwen/Qwen-7B" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97" + pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then model_name_or_path="Intel/neural-chat-7b-v3" elif [ "${topology}" = "phi_1b" ]; then @@ -155,16 +162,15 @@ function run_benchmark { elif [ "${topology}" = "gpt_j_woq_bab" ]; then extra_cmd=$extra_cmd" --bitsandbytes" elif [ "${topology}" = "gpt_j_woq_load4bit" ]; then - extra_cmd=$extra_cmd" --load_in_4bit True" + extra_cmd=$extra_cmd" --load_in_4bit " elif [ "${topology}" = "gpt_j_woq_load8bit" ]; then - extra_cmd=$extra_cmd" --load_in_8bit True" + extra_cmd=$extra_cmd" --load_in_8bit " elif [ "${topology}" = "gpt_j_mp" ]; then extra_cmd=$extra_cmd" --mixed_precision" elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq --woq_weight_dtype int4_clip --woq_compute_dtype fp32" - extra_cmd=$extra_cmd" --woq_algo "GPTQ" --gptq_actorder --gptq_block_size 128 --gptq_pad_max_length 2048 --gptq_use_max_length" - pip install transformers==4.35.2 + extra_cmd=$extra_cmd" --woq --bits 4 --weight_dtype int4_clip --compute_dtype fp32 --scheme asym " + extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 
--max_input_length 2048 " else extra_cmd=$extra_cmd" --int8" fi diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index fc2a89c42de..4886974d5d1 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -15,9 +15,13 @@ from optimum.intel.generation.modeling import TSModelForCausalLM from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, ) parser = argparse.ArgumentParser() @@ -96,10 +100,16 @@ "--woq_algo", default="RTN", choices=["RTN", "AWQ", "TEQ", "GPTQ", "AUTOROUND"], - help="Weight-only parameter.", + help="Weight-only algorithm.", ) parser.add_argument( - "--woq_weight_dtype", + "--bits", + type=int, + default=8, + choices=[4, 8], +) +parser.add_argument( + "--weight_dtype", type=str, default="int8", choices=[ @@ -114,77 +124,84 @@ ], ) parser.add_argument( - "--woq_scale_dtype", + "--scale_dtype", type=str, default="fp32", choices=["fp32", "fp8"], ) parser.add_argument( - "--woq_compute_dtype", + "--compute_dtype", type=str, default="fp32", choices=["fp32", "bf16", "int8"], ) -parser.add_argument("--woq_group_size", type=int, default=32) -parser.add_argument("--woq_scheme", default="sym") +parser.add_argument("--group_size", type=int, default=32) +parser.add_argument("--scheme", default="sym") +# ============GPTQ configs============== parser.add_argument( - "--gptq_actorder", + "--desc_act", action="store_true", help="Whether to apply the activation order GPTQ heuristic.", ) parser.add_argument( - "--gptq_percdamp", + "--damp_percent", type=float, default=0.01, help="Percent of the average Hessian diagonal to use for dampening.", ) parser.add_argument( - "--gptq_block_size", + "--blocksize", type=int, default=128, help="Block size. sub weight matrix size to run GPTQ.", ) parser.add_argument( - "--gptq_nsamples", type=int, default=128, help="Number of calibration data samples." + "--nsamples", type=int, default=128, help="Number of calibration data samples." 
) parser.add_argument( - "--gptq_use_max_length", - action="store_true", - help="Set all sequence length to be same length of args.gptq_pad_max_length", -) -parser.add_argument( - "--gptq_pad_max_length", + "--max_input_length", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", ) -parser.add_argument('--gptq_static_groups', action='store_true', help='Use determined group to do quantization') +parser.add_argument( + "--static_groups", + action="store_true", + help="Use determined group to do quantization", +) # ============AUTOROUND configs============== parser.add_argument( - "--autoround_nsamples", - type=int, default=512, - help="Number of calibration data samples.", + "--lr", + type=float, + default=0.0025, + help="learning rate, if None, it will be set to 1.0/iters automatically", ) parser.add_argument( - "--autoround_seq_len", - type=int, - default=2048, - help="Calibration dataset sequence max length, this should align with your model config", + "--minmax_lr", + type=float, + default=0.0025, + help="minmax learning rate, if None,it will beset to be the same with lr", +) +parser.add_argument( + "--use_quant_input", + action="store_true", + help="whether to use the output of quantized block to tune the next block", ) + # ============BitsAndBytes configs============== parser.add_argument("--bitsandbytes", action="store_true") # ============AutoModel parameters============== -parser.add_argument("--load_in_4bit", type=bool, default=False) -parser.add_argument("--load_in_8bit", type=bool, default=False) +parser.add_argument("--load_in_4bit", action="store_true") +parser.add_argument("--load_in_8bit", action="store_true") parser.add_argument("--_commit_hash", default=None, type=str) -parser.add_argument("--trust_remote_code", type=bool, default=False) +parser.add_argument("--trust_remote_code", action="store_true") parser.add_argument("--use_neural_speed", action="store_true") # ======================================= args = parser.parse_args() # transformers version >= 4.32.0 contained the mpt modeling definition. # https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py # 4.31.0 for ipex.optimize_transformers -check_min_version("4.31.0") +check_min_version("4.35.2") # get model config if args.peft_model_id: from peft import PeftConfig @@ -196,13 +213,15 @@ config = AutoConfig.from_pretrained( args.model, - torchscript=True - if ( - args.sq - or args.woq_algo in ["AWQ", "TEQ"] - or (args.int8 or args.int8_bf16_mixed) - ) - else False, # torchscript will force `return_dict=False` to avoid jit errors + torchscript=( + True + if ( + args.sq + or args.woq_algo in ["AWQ", "TEQ"] + or (args.int8 or args.int8_bf16_mixed) + ) + else False + ), # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
trust_remote_code=args.trust_remote_code, _commit_hash=args._commit_hash, @@ -284,54 +303,80 @@ calib_pad_val=args.calib_pad_val, ) elif args.woq: - if args.woq_algo == "GPTQ": - algorithm_args = { - "act_order": args.gptq_actorder, - "percdamp": args.gptq_percdamp, - "block_size": args.gptq_block_size, - "nsamples": args.gptq_nsamples, - "use_max_length": args.gptq_use_max_length, - "pad_max_length": args.gptq_pad_max_length, - "static_groups": args.gptq_static_groups, - } - quantization_config = WeightOnlyQuantConfig( - compute_dtype=args.woq_compute_dtype, - scale_dtype=args.woq_scale_dtype, - weight_dtype=args.woq_weight_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, + if args.woq_algo == "RTN": + quantization_config = RtnConfig( tokenizer=tokenizer, - algorithm_args=algorithm_args, + bits=args.bits, + scheme=args.scheme, + group_size=args.group_size, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + ) + elif args.woq_algo == "AWQ": + quantization_config = AwqConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + zero_point=False if args.scheme == "sym" else True, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + ) + elif args.woq_algo == "TEQ": + quantization_config = TeqConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + scheme=args.scheme, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + ) + elif args.woq_algo == "GPTQ": + quantization_config = GPTQConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + desc_act=args.desc_act, + damp_percent=args.damp_percent, + sym=True if args.scheme == "sym" else False, + blocksize=args.blocksize, + nsamples=args.nsamples, + static_groups=args.static_groups, + group_size=args.group_size, + max_input_length=args.max_input_length, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, ) elif args.woq_algo == "AUTOROUND": - algorithm_args = { - "n_samples": args.autoround_nsamples, - "seq_len": args.autoround_seq_len, - "iters": args.calib_iters, - "scale_dtype": "fp32", - } - quantization_config = WeightOnlyQuantConfig( - compute_dtype=args.woq_compute_dtype, - scale_dtype=args.woq_scale_dtype, - weight_dtype=args.woq_weight_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, + quantization_config = AutoRoundConfig( tokenizer=tokenizer, - algorithm_args=algorithm_args, - calib_dataset=args.dataset + dataset=args.dataset, + bits=args.bits, + sym=True if args.scheme == "sym" else False, + nsamples=args.nsamples, + group_size=args.group_size, + compute_dtype=args.compute_dtype, + scale_dtype=args.scale_dtype, + weight_dtype=args.weight_dtype, + calib_iters=args.calib_iters, + calib_len=args.calib_len, + lr=args.lr, + minmax_lr=args.minmax_lr, + use_quant_input=args.use_quant_input, ) else: - quantization_config = WeightOnlyQuantConfig( - compute_dtype=args.woq_compute_dtype, - scale_dtype=args.woq_scale_dtype, - weight_dtype=args.woq_weight_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, - tokenizer=tokenizer, - ) # 
default is A32W4G32 + assert False, "Please set the correct '--woq_algo'" + # bitsandbytes elif args.bitsandbytes: # GPU device is need for `load_in_4bit` and `load_in_8bit`. @@ -383,7 +428,6 @@ user_model.save_pretrained(args.output_dir) - # int8 model loading if args.int8 or args.int8_bf16_mixed: # TorchScript model don't attribute generate method, the wrapper is provided. @@ -412,7 +456,9 @@ if args.benchmark: - user_model = user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model + user_model = ( + user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model + ) prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun." input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1) @@ -467,12 +513,15 @@ print("Throughput: {} samples/sec".format(throughput)) if args.accuracy: - user_model = user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model + user_model = ( + user_model.eval() if not (args.int8 or args.int8_bf16_mixed) else user_model + ) args.model = ( peft_config.base_model_name_or_path if args.peft_model_id else args.model ) from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate - args._commit_hash = "main" if args._commit_hash is None else args._commit_hash + + args._commit_hash = "main" if args._commit_hash is None else args._commit_hash results = evaluate( model="hf-causal", model_args="pretrained=" diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py index 15d4cae794d..5da78878041 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py @@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig import intel_extension_for_pytorch as ipex from intel_extension_for_transformers.llm.utils.generation import _beam_search, _greedy_search -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig, GPTQConfig from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_str2torch from transformers.utils import check_min_version @@ -76,12 +76,7 @@ "--gptq_nsamples", type=int, default=128, help="Number of calibration data samples." 
) parser.add_argument( - "--gptq_use_max_length", - action="store_true", - help="Set all sequence length to be same length of args.gptq_pad_max_length", -) -parser.add_argument( - "--gptq_pad_max_length", + "--max_input_length", type=int, default=2048, help="Calibration dataset sequence max length, this should align with your model config", @@ -118,26 +113,25 @@ quantization_config = None if args.woq: if args.woq_algo == "GPTQ": - algorithm_args = { - "act_order": False, - "percdamp": args.gptq_percdamp, - "block_size": args.gptq_block_size, - "nsamples": args.gptq_nsamples, - "use_max_length": args.gptq_use_max_length, - "pad_max_length": args.gptq_pad_max_length, - } - quantization_config = WeightOnlyQuantConfig( + quantization_config = GPTQConfig( + tokenizer=tokenizer, + dataset=args.dataset, + bits=args.bits, + desc_act=args.desc_act, + damp_percent=args.gptq_percdamp, + sym=True if args.woq_scheme == "sym" else False, + blocksize=args.gptq_block_size, + nsamples=args.gptq_nsamples, + static_groups=args.static_groups, + group_size=args.woq_group_size, + max_input_length=args.max_input_length, compute_dtype=args.compute_dtype, scale_dtype=args.compute_dtype, weight_dtype=args.woq_dtype, - scheme=args.woq_scheme, - group_size=args.woq_group_size, - algorithm=args.woq_algo, - tokenizer=tokenizer, - algorithm_args=algorithm_args, + calib_iters=args.calib_iters, ) else: - quantization_config = WeightOnlyQuantConfig( + quantization_config = RtnConfig( compute_dtype=args.compute_dtype, weight_dtype=args.woq_dtype, group_size=args.woq_group_size, scale_dtype=args.compute_dtype ) #default is A16W4G16 @@ -177,7 +171,7 @@ if user_model is None else user_model user_model = user_model.to(memory_format=torch.channels_last) if quantization_config is None: - quantization_config = WeightOnlyQuantConfig.from_pretrained(args.model) + quantization_config = user_model.quantization_config if hasattr(user_model, "quantization_config") else {} if not args.disable_optimize_transformers: print("Optimize with IPEX...") user_model = ipex.optimize_transformers( diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh index 1fa5e19e70b..d2ffbee20e9 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh @@ -62,21 +62,15 @@ function run_tuning { elif [ "${topology}" = "gpt_j_woq_rtn" ]; then model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" extra_cmd=$extra_cmd" --woq" - pip install torch==2.1.0+cpu torchvision==0.16.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install intel-extension-for-pytorch==2.1.0 elif [ "${topology}" = "gpt_j_woq_bab" ]; then model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" extra_cmd=$extra_cmd" --bitsandbytes" elif [ "${topology}" = "gpt_j_woq_load4bit" ]; then model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" - extra_cmd=$extra_cmd" --load_in_4bit True" - pip install torch==2.1.0+cpu torchvision==0.16.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install intel-extension-for-pytorch==2.1.0 + extra_cmd=$extra_cmd" --load_in_4bit" elif [ "${topology}" = "gpt_j_woq_load8bit" ]; then model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" - extra_cmd=$extra_cmd" --load_in_8bit True" - pip install torch==2.1.0+cpu torchvision==0.16.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install 
intel-extension-for-pytorch==2.1.0 + extra_cmd=$extra_cmd" --load_in_8bit " elif [ "${topology}" = "gpt_j_mp" ]; then model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" extra_cmd=$extra_cmd" --mixed_precision" @@ -132,19 +126,19 @@ function run_tuning { model_name_or_path="THUDM/chatglm3-6b" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "chatglm2_6b" ]; then alpha=0.75 model_name_or_path="THUDM/chatglm2-6b" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "chatglm_6b" ]; then alpha=0.75 model_name_or_path="THUDM/chatglm-6b" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" pip install transformers==4.33 elif [ "${topology}" = "falcon_7b" ]; then alpha=0.7 @@ -157,58 +151,64 @@ function run_tuning { model_name_or_path="baichuan-inc/Baichuan-7B" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "baichuan_13b" ]; then alpha=0.85 model_name_or_path="baichuan-inc/Baichuan-13B-Base" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00" + pip install transformers==4.33 elif [ "${topology}" = "baichuan2_7b" ]; then alpha=0.85 model_name_or_path="baichuan-inc/Baichuan2-7B-Base" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "baichuan2_13b" ]; then alpha=0.55 model_name_or_path="baichuan-inc/Baichuan2-13B-Base" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + pip install transformers==4.33 elif [ "${topology}" = "qwen_7b" ]; then alpha=0.9 model_name_or_path="Qwen/Qwen-7B" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" + extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97" + pip install transformers==4.35.2 elif [ "${topology}" = "mistral_7b" ]; then alpha=0.8 model_name_or_path="Intel/neural-chat-7b-v3" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" elif [ "${topology}" = "phi_1b" ]; then alpha=0.5 model_name_or_path="susnato/phi-1_dev" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" 
pip install transformers==4.36.1 elif [ "${topology}" = "phi_1_5b" ]; then alpha=0.5 model_name_or_path="susnato/phi-1_5_dev" extra_cmd=$extra_cmd" --sq --alpha ${alpha}" extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" + extra_cmd=$extra_cmd" --trust_remote_code" pip install transformers==4.36.1 elif [ "${topology}" = "llama2_7b_int4_gptq" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq --woq_weight_dtype int4_clip --woq_compute_dtype fp32" - extra_cmd=$extra_cmd" --woq_algo "GPTQ" --gptq_actorder --gptq_block_size 128 --gptq_pad_max_length 2048 --gptq_use_max_length" + extra_cmd=$extra_cmd" --woq --bits 4 --weight_dtype int4_clip --compute_dtype fp32 --scheme asym " + extra_cmd=$extra_cmd" --woq_algo "GPTQ" --desc_act --blocksize 128 --max_input_length 2048 " extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}" - extra_cmd=$extra_cmd" --trust_remote_code True" - pip install transformers==4.35.2 + extra_cmd=$extra_cmd" --trust_remote_code" fi if [ ${script} = "run_generation.py" ];then diff --git a/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py b/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py index 1c7aaa413ed..a309a8dc45f 100644 --- a/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py +++ b/intel_extension_for_transformers/llm/evaluation/lm_eval/models/huggingface.py @@ -615,9 +615,12 @@ class AutoCausalLM(HuggingFaceAutoLM): def __init__(self, *args, pretrained, model_format, **kwargs): self.model_format = model_format if self.model_format == "runtime": - from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig + from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, GPTQConfig, AutoRoundConfig use_gptq = kwargs.pop("use_gptq", False) - self.woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4", use_gptq=use_gptq) + if use_gptq: + self.woq_config = GPTQConfig(bits=4, compute_dtype="int8", weight_dtype="int4") + else: + self.woq_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4") super().__init__(*args, pretrained=pretrained, model_format=model_format, **kwargs) if self.model_format == "runtime": diff --git a/intel_extension_for_transformers/llm/quantization/gptq_utils.py b/intel_extension_for_transformers/llm/quantization/gptq_utils.py index 53807988b55..d4f64182eb6 100644 --- a/intel_extension_for_transformers/llm/quantization/gptq_utils.py +++ b/intel_extension_for_transformers/llm/quantization/gptq_utils.py @@ -18,7 +18,7 @@ def unpack_weight(qweight, scales, qzeros, q_config): - bits = q_config["bits"] + bits = q_config.bits wf = torch.tensor([[0, 4, 8, 12, 16, 20, 24, 28]], dtype=torch.int32) zeros = torch.bitwise_right_shift( torch.unsqueeze(qzeros, 2).expand(-1, -1, 32 // bits), wf.unsqueeze(0) diff --git a/intel_extension_for_transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/llm/quantization/nn/modules.py index 399b3028aec..ae8f959f1a8 100644 --- a/intel_extension_for_transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/llm/quantization/nn/modules.py @@ -25,7 +25,6 @@ from peft.utils.other import transpose from intel_extension_for_transformers.llm.quantization.autograd import matmul_kbit - torch.ops.load_library( os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../..", "libqbits.so") ) @@ -171,9 +170,21 @@ def set_gptq_weights_bias( 
bias=None, ): - if q_config.gptq_quantize_config["desc_act"]: + if int_weight.is_meta: + int_weight = torch.ones(int_weight.shape, dtype=torch.int8) + gptq_scales = torch.rand( + self.in_features // self.blocksize, + self.out_features, + dtype=torch.float16, + ) + gptq_zeros = torch.ones( + self.in_features // self.blocksize, self.out_features, dtype=torch.int8 + ) + if q_config.quant_method.value != "autoround" and q_config.desc_act: + g_idx = torch.zeros(self.blocksize, dtype=torch.int32) + if q_config.quant_method.value != "autoround" and q_config.desc_act: int_weight2 = int_weight.clone() - group_size = q_config.gptq_quantize_config["group_size"] + group_size = q_config.group_size group_dict = {} for i in range(len(g_idx)): group_idx = g_idx[i].item() @@ -186,14 +197,14 @@ def set_gptq_weights_bias( int_weight2[target_idx] = int_weight[i] int_weight = int_weight2 - if q_config.gptq_quantize_config["bits"] == 4: + if q_config.bits == 4: int_weight = (int_weight - 8) * 16 gptq_scales = gptq_scales / 16 gptq_zeros = (gptq_zeros - 8) * 16 - if q_config.gptq_quantize_config["sym"]: + if q_config.sym: gptq_zeros = torch.empty(0, dtype=torch.int8) - if not q_config.gptq_quantize_config["desc_act"]: + if q_config.quant_method.value == "autoround" or (not q_config.desc_act): g_idx = torch.empty(0, dtype=torch.int32) packw = torch.ops.bestlaop.woq_packq( @@ -204,7 +215,7 @@ def set_gptq_weights_bias( q_config.weight_dtype, q_config.scale_dtype, q_config.compute_dtype, - not q_config.gptq_quantize_config["sym"], + not q_config.sym, self.blocksize, ) diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 6a021fb9abf..bb0f33174d9 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -32,12 +32,17 @@ def optimize(self, model, use_neural_speed=False): optimized_model = model from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, BitsAndBytesConfig ) - assert type(self.optimization_config) in [MixedPrecisionConfig, WeightOnlyQuantConfig, BitsAndBytesConfig], \ - f"Expect optimization_config be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" + \ - " or BitsAndBytesConfig,got {type(self.optimization_config)}." + assert type(self.optimization_config) in [MixedPrecisionConfig, RtnConfig, AwqConfig, TeqConfig, + GPTQConfig, AutoRoundConfig, BitsAndBytesConfig], \ + f"Expect optimization_config be an object of MixedPrecisionConfig, RtnConfig, AwqConfig, TeqConfig," + \ + " GPTQConfig, AutoRoundConfig or BitsAndBytesConfig,got {type(self.optimization_config)}." 
config = self.optimization_config if re.search("flan-t5", model_name, re.IGNORECASE): from intel_extension_for_transformers.transformers import AutoModelForSeq2SeqLM diff --git a/intel_extension_for_transformers/llm/quantization/utils.py b/intel_extension_for_transformers/llm/quantization/utils.py index ad1d81db593..ec771e85e6f 100644 --- a/intel_extension_for_transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/llm/quantization/utils.py @@ -106,10 +106,15 @@ def _replace_linear( current_key_name.append(name) is_removed = False - if (isinstance(module, torch.nn.Linear) or isinstance(module, WeightOnlyLinear) - or (is_autoround_available() and isinstance(module, auto_round_woqlinear)) or (is_ipex_available() - and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear))) \ - and (name not in modules_to_not_convert): + if ( + isinstance(module, torch.nn.Linear) + or isinstance(module, WeightOnlyLinear) + or (is_autoround_available() and isinstance(module, auto_round_woqlinear)) + or ( + is_ipex_available() + and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear) + ) + ) and (name not in modules_to_not_convert): # Check if the current key is not in the `modules_to_not_convert` if not any( key in ".".join(current_key_name) for key in modules_to_not_convert @@ -117,7 +122,11 @@ def _replace_linear( with init_empty_weights(): in_features = module.in_features out_features = module.out_features - if device == "cpu" or device == torch.device("cpu") or device == "auto": + if ( + device == "cpu" + or device == torch.device("cpu") + or device == "auto" + ): from .nn.modules import ( QuantizedLinearQBits, ) # TODO: QuantizedLinearINT4, QuantizedLinearINT8 @@ -146,28 +155,59 @@ def _replace_linear( scale_dtype=quantization_config.scale_dtype, blocksize=quantization_config.group_size, scheme=quantization_config.scheme, - compression_dtype=module.compression_dtype - if hasattr(module, "compression_dtype") else torch.int8, - compression_dim=module.compression_dim if hasattr(module, "compression_dim") else 0, + compression_dtype=( + module.compression_dtype + if hasattr(module, "compression_dtype") + else torch.int8 + ), + compression_dim=( + module.compression_dim + if hasattr(module, "compression_dim") + else 0 + ), device=device, - use_optimum_format=module.use_optimum_format - if hasattr(module, "use_optimum_format") else False, + use_optimum_format=( + module.use_optimum_format + if hasattr(module, "use_optimum_format") + else False + ), ) - if quantization_config.algorithm == "GPTQ": - g_idx = module.g_idx if hasattr(module, "g_idx") else \ - torch.zeros(in_features, dtype=torch.int32).to(device) + if quantization_config.quant_method.value == "gptq": + g_idx = ( + module.g_idx + if hasattr(module, "g_idx") + else torch.zeros(in_features, dtype=torch.int32).to( + device + ) + ) else: g_idx = None model._modules[name].set_scales_zps_gidx( - module.scales if hasattr(module, "scales") else torch.ones( - (out_features, math.ceil(in_features / quantization_config.group_size)), - dtype=convert_dtype_str2torch(quantization_config.compute_dtype), - device=torch.device(device)), + ( + module.scales + if hasattr(module, "scales") + else torch.ones( + ( + out_features, + math.ceil( + in_features / quantization_config.group_size + ), + ), + dtype=convert_dtype_str2torch( + quantization_config.compute_dtype + ), + device=torch.device(device), + ) + ), module.qzeros if hasattr(module, "qzeros") else None, - g_idx + g_idx, ) else: - raise Exception("{} device Unsupported 
weight only quantization!".format(device)) + raise Exception( + "{} device Unsupported weight only quantization!".format( + device + ) + ) is_replaced = True # Store the module class in case we need to transpose the weight later @@ -175,29 +215,30 @@ def _replace_linear( # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) if device == "cpu" or device == torch.device("cpu") or device == "auto": - if not empty_weights: - if quantization_config.algorithm == "GPTQ" or quantization_config.algorithm == "AUTOROUND": - from .gptq_utils import unpack_weight + if quantization_config.quant_method.value in ["gptq", "autoround"]: + from .gptq_utils import unpack_weight + + if not empty_weights: int_weight, gptq_scales, gptq_zeros = unpack_weight( module.qweight, module.scales, module.qzeros, - quantization_config.gptq_quantize_config, - ) - int_weight = int_weight.view(-1, int_weight.shape[-1]) - model._modules[name].set_gptq_weights_bias( - int_weight, - gptq_scales, - gptq_zeros, - module.g_idx if hasattr(module, "g_idx") else None, quantization_config, - bias=None if module.bias is None else module.bias.data, ) + int_weight = int_weight.view(-1, int_weight.shape[-1]) else: - model._modules[name].set_weights_bias( - module.weight.data, - None if module.bias is None else module.bias.data, - ) + int_weight = module.weight + int_weight = int_weight.view(int_weight.shape[-1], -1) + gptq_scales = None + gptq_zeros = None + model._modules[name].set_gptq_weights_bias( + int_weight, + gptq_scales, + gptq_zeros, + module.g_idx if hasattr(module, "g_idx") else None, + quantization_config, + bias=None if module.bias is None else module.bias.data, + ) else: model._modules[name].set_weights_bias( module.weight.data, @@ -205,19 +246,23 @@ def _replace_linear( ) else: if not hasattr(module, "qweight"): - n_pack = 8 // DTYPE_BITS_MAPPING[quantization_config.weight_dtype] + n_pack = ( + 8 // DTYPE_BITS_MAPPING[quantization_config.weight_dtype] + ) weight = torch.zeros( (math.ceil(out_features / n_pack), in_features), - dtype=torch.int8, device=torch.device(device) + dtype=torch.int8, + device=torch.device(device), ) model._modules[name].set_weights_bias( module.qweight.data if hasattr(module, "qweight") else weight, - None if module.bias is None else module.bias.data) + None if module.bias is None else module.bias.data, + ) del module gc.collect() is_removed = True - if not is_removed and len(list(module.children())) > 0: # pylint: disable=E1101 + if not is_removed and len(list(module.children())) > 0: # pylint: disable=E1101 _, is_replaced = _replace_linear( module, modules_to_not_convert, @@ -235,12 +280,16 @@ def _replace_linear( def convert_to_quantized_model(model, config, device="cpu"): if device == "xpu" or device == torch.device("xpu"): import intel_extension_for_pytorch - assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" + + assert ( + hasattr(torch, "xpu") and torch.xpu.is_available() + ), "There is no xpu device in this system!" 
calib_dataloader = config.calib_dataloader calib_func = config.calib_func calib_iters = config.calib_iters model_device = next(model.parameters()).device - if calib_dataloader is None and config.algorithm in ["TEQ", "AWQ", "GPTQ", "AUTOROUND"]: + + if calib_dataloader is None and config.quant_method.value not in ["rtn"]: from datasets import load_dataset from torch.utils.data import DataLoader @@ -251,9 +300,9 @@ def convert_to_quantized_model(model, config, device="cpu"): if config.tokenizer is None: logger.error( "Please provide the tokenizer or provide calib_func directly," - + " the following is how to get tokenizer. \n" + - " from transformer import AutoTokenizer \n" + - " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" ) exit(0) @@ -266,8 +315,9 @@ def tokenize_function(examples): example = config.tokenizer(examples["text"]) else: logger.error( - "Please check dataset prompt identifier," + - " NeelNanda/pile-10k is default used calibration dataset.") + "Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset." + ) exit(0) return example @@ -278,7 +328,11 @@ def collate_batch(batch): input_ids_padded = [] for text in batch: input_ids = text["input_ids"] - input_ids = input_ids[:512] if (len(input_ids) > 512 and config.algorithm != "GPTQ") else input_ids + input_ids = ( + input_ids[:512] + if (len(input_ids) > 512 and config.quant_method.value != "gptq") + else input_ids + ) input_ids_padded.append(input_ids) return torch.vstack(input_ids_padded) @@ -286,21 +340,22 @@ def collate_batch_for_autoround(batch): input_ids_padded = [] for text in batch: input_ids = text["input_ids"] - if input_ids.shape[0] < config.algorithm_args["seq_len"]: + if input_ids.shape[0] < config.calib_len: continue - input_ids = input_ids[:config.algorithm_args["seq_len"]] + input_ids = input_ids[: config.calib_len] input_ids_list = input_ids.tolist() - if input_ids_list.count(input_ids_list[-1]) > config.algorithm_args["seq_len"] // 2: + if input_ids_list.count(input_ids_list[-1]) > config.calib_len // 2: continue input_ids_padded.append(input_ids) if len(input_ids_padded) == 0: return None return torch.vstack(input_ids_padded) - if config.algorithm == "AUTOROUND": + + if config.quant_method.value == "autoround": calib_dataloader = DataLoader( tokenized_dataset, - batch_size=1, + batch_size=8, shuffle=False, collate_fn=collate_batch_for_autoround, ) @@ -311,7 +366,7 @@ def collate_batch_for_autoround(batch): shuffle=False, collate_fn=collate_batch, ) - if calib_func is None and config.algorithm in ["AWQ"]: + if calib_func is None and config.quant_method.value == "awq": def default_calib_func(model): """ @@ -321,12 +376,16 @@ def default_calib_func(model): for i, (input_ids) in enumerate(calib_dataloader): if i >= calib_iters: break - model(input_ids=input_ids, ) + model( + input_ids=input_ids, + ) calib_func = default_calib_func - logger.info("The default calibration function is used, " + - "the calibration dataset is NeelNanda/pile-10k," + - "batchsize is 1 and calibration iteration is 100.") + logger.info( + "The default calibration function is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100." 
+ ) if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]: return replace_linear(model, None, None, config, device=device) else: @@ -337,18 +396,60 @@ def default_calib_func(model): dtype = "int4" else: dtype = config.weight_dtype - recipes = { - "rtn_args": { - "enable_full_range": True - if "fullrange" in config.weight_dtype - else False, - "enable_mse_search": config.mse_range, - }, - "awq_args": config.algorithm_args.update({"enable_mse_search": config.mse_range}) - if config.algorithm == "AWQ" and config.algorithm_args is not None else {}, - "gptq_args": config.algorithm_args if config.algorithm == "GPTQ" else None, - "autoround_args": config.algorithm_args if config.algorithm == "AUTOROUND" else None - } + # mapping to INC config + if config.quant_method.value == "rtn": + recipes = { + "rtn_args": { + "enable_full_range": ( + True if "fullrange" in config.weight_dtype else False + ), + "enable_mse_search": config.mse_range, + } + } + algorithm = "RTN" + elif config.quant_method.value == "awq": + recipes = { + "rtn_args": { + "enable_full_range": ( + True if "fullrange" in config.weight_dtype else False + ), + "enable_mse_search": config.mse_range, + }, + "awq_args": {}, + } + algorithm = "AWQ" + elif config.quant_method.value == "teq": + recipes = {"teq_args": {}} + algorithm = "TEQ" + elif config.quant_method.value == "gptq": + recipes = { + "gptq_args": { + "act_order": config.desc_act, + "percdamp": config.damp_percent, + "block_size": config.blocksize, + "nsamples": config.nsamples, + "use_max_length": True if config.max_input_length else False, + "pad_max_length": config.max_input_length, + "static_groups": config.static_groups, + } + } + algorithm = "GPTQ" + elif config.quant_method.value == "autoround": + recipes = { + "autoround_args": { + "n_samples": config.nsamples, + "seq_len": config.calib_len, + "iters": config.calib_iters, + "scale_dtype": config.scale_dtype, + "use_quant_input": config.use_quant_input, + "lr": config.lr, + "minmax_lr": config.minmax_lr, + } + } + algorithm = "AUTOROUND" + else: + assert False, "The Supported algorithm are RTN, AWQ, TEQ, GPTQ, AUTOROUND" + conf = PostTrainingQuantConfig( approach="weight_only", op_type_dict={ @@ -358,22 +459,20 @@ def default_calib_func(model): "dtype": dtype, "group_size": config.group_size, # -1 (per-channel) "scheme": config.scheme, - "algorithm": config.algorithm, + "algorithm": algorithm, }, }, }, op_name_dict={ - '.*lm_head': { # re.match - "weight": { - 'dtype': 'fp32' - }, + ".*lm_head": { # re.match + "weight": {"dtype": "fp32"}, }, }, recipes=recipes, ) # TEQ: set calib_func=None, use default training func as calib_func # RTN: doesn't need calib_func - if config.algorithm in ["TEQ", "RTN", "GPTQ", "AUTOROUND"]: + if config.quant_method.value not in ["awq"]: calib_func = None orig_dtype = torch.float32 @@ -382,59 +481,34 @@ def default_calib_func(model): if orig_dtype != torch.float32: model.to(dtype=torch.float32) break - inc_model = quantization.fit(model, - conf, - calib_func=calib_func, - calib_dataloader=calib_dataloader) + inc_model = quantization.fit( + model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader + ) if device == "xpu" or device == torch.device("xpu"): - model = inc_model.export_compressed_model(compression_dtype=torch.int8, - compression_dim=0, - use_optimum_format=False, - scale_dtype=convert_dtype_str2torch(config.scale_dtype)) - q_model = replace_linear(model, - None, - None, - config, - device=device) - else: - if config.algorithm == "GPTQ": - inc_model = 
inc_model.export_compressed_model(use_optimum_format=True) - inc_model.eval() - quantize_config = { - "bits": bits, - "group_size": config.group_size, - "damp_percent": config.algorithm_args["percdamp"], - "desc_act": config.algorithm_args["act_order"], - "sym": True if config.scheme == "sym" else False, - "true_sequential": True, - "model_name_or_path": "null", - "model_file_base_name": "model", - } + model = inc_model.export_compressed_model( + compression_dtype=torch.int8, + compression_dim=0, + use_optimum_format=False, + scale_dtype=convert_dtype_str2torch(config.scale_dtype), + ) - setattr(config, "gptq_quantize_config", quantize_config) - q_model = replace_linear(inc_model, None, None, config, device=device) - elif config.algorithm == "AUTOROUND": + q_model = replace_linear(model, None, None, config, device=device) + else: + if config.quant_method in ["gptq", "autoround"]: inc_model = inc_model.export_compressed_model(use_optimum_format=True) inc_model.eval() - quantize_config = { - "bits": bits, - "group_size": config.group_size, - "desc_act": False, - "sym": True if config.scheme == "sym" else False, - "true_sequential": True, - "model_name_or_path": "null", - "model_file_base_name": "model", - } - - setattr(config, "gptq_quantize_config", quantize_config) q_model = replace_linear(inc_model, None, None, config, device=device) else: - q_model = replace_linear(inc_model.model, None, None, config, device=device) + q_model = replace_linear( + inc_model.model, None, None, config, device=device + ) if orig_dtype != torch.float32: q_model.to(dtype=orig_dtype) + return q_model.to(device) + def convert_dtype_str2torch(str_dtype): if str_dtype == "int8": return torch.int8 @@ -469,5 +543,7 @@ def get_bits(config): elif "int4" in config.weight_dtype: bits = 4 else: - assert False, "Unsupported {} for quantize weight only by IPEX backend".format(config.weight_dtype) + assert False, "Unsupported {} for quantize weight only by IPEX backend".format( + config.weight_dtype + ) return bits diff --git a/intel_extension_for_transformers/llm/runtime/neural_speed/README.md b/intel_extension_for_transformers/llm/runtime/neural_speed/README.md index 6959f5ff63f..afb4ba18604 100644 --- a/intel_extension_for_transformers/llm/runtime/neural_speed/README.md +++ b/intel_extension_for_transformers/llm/runtime/neural_speed/README.md @@ -63,11 +63,11 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) To directly load a GPTQ/AWQ/AutoRound model, here is the sample code: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, GPTQConfig # Download Hugging Face GPTQ model to local path model_name = "PATH_TO_MODEL" # local path to model -woq_config = WeightOnlyQuantConfig(use_gptq=True) # use_awq=True for AWQ models, and use_autoround=True for AutoRound models +woq_config = GPTQConfig(bits=4) # use AwqConfig for AWQ models, and AutoRoundConfig for AutoRound models prompt = "Once upon a time, a little girl" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -80,7 +80,7 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) To directly load a GGUF model, here is the sample code: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from 
intel_extension_for_transformers.transformers import AutoModelForCausalLM # Specify the GGUF repo on the Hugginface model_name = "TheBloke/Llama-2-7B-Chat-GGUF" @@ -100,9 +100,9 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300) To enable [StreamingLLM for infinite inference](./docs/infinite_inference.md), here is the sample code: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig model_name = "Intel/neural-chat-7b-v3-1" # Hugging Face model_id or local model -woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") +woq_config = RtnConfig(compute_dtype="int8", weight_dtype="int4") prompt = "Once upon a time, there existed a little girl," tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -118,21 +118,20 @@ outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300, ctx_size To use whisper to Audio-to-text, here is the sample code ```python -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig model_name = "Local path for whisper" # please use local path -woq_config = WeightOnlyQuantConfig(use_ggml=True) #Currently, only Q40 is supported +woq_config = RtnConfig(use_ggml=True) #Currently, only Q40 is supported model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config) model('Local audio file') ``` https://github.com/intel/intel-extension-for-transformers/assets/109187816/1698dcda-c9ec-4f44-b159-f4e9d67ab15b -Argument description of WeightOnlyQuantConfig ([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)): +Argument description of RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoroundConfig([supported MatMul combinations](#supported-matrix-multiplication-data-types-combinations)): | Argument | Type | Description | | -------------- | ---------- | ----------------------------------------------------------------------- | | compute_dtype | String | Data type of Gemm computation: int8/bf16/fp16/fp32 (default: fp32) | | weight_dtype | String | Data type of quantized weight: int4/int8/fp8(=fp8_e4m3)/fp8_e5m2/fp4(=fp4_e2m1)/nf4 (default int4) | -| alg | String | Quantization algorithm: sym/asym (default sym) | | group_size | Int | Group size: Int, 32/128/-1 (per channel) (default: 32) | | scale_dtype | String | Data type of scales: fp32/bf16/fp8 (default fp32) | | use_ggml | Bool | Enable ggml for quantization and inference (default: False) | @@ -171,11 +170,11 @@ Argument description of generate function: Chat with LLaMA2: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig # Please change to local path to model, llama2 does not support online conversion, currently. 
model_name = "meta-llama/Llama-2-7b-chat-hf" -woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") +woq_config = RtnConfig(compute_dtype="int8", weight_dtype="int4") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) streamer = TextStreamer(tokenizer) model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True) @@ -192,10 +191,10 @@ while True: Chat with ChatGLM2: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig model_name = "THUDM/chatglm2-6b" # or local path to model -woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") +woq_config = RtnConfig(compute_dtype="int8", weight_dtype="int4") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) streamer = TextStreamer(tokenizer) model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True) @@ -212,10 +211,10 @@ while True: Chat with Qwen: ```python from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig model_name = "Qwen/Qwen-7B-Chat" # or local path to model -woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") +woq_config = RtnConfig(compute_dtype="int8", weight_dtype="int4") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) streamer = TextStreamer(tokenizer) model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True) diff --git a/intel_extension_for_transformers/llm/runtime/neural_speed/tests/test_neural_speed.py b/intel_extension_for_transformers/llm/runtime/neural_speed/tests/test_neural_speed.py index 0c917eb1c1e..69a91090eab 100644 --- a/intel_extension_for_transformers/llm/runtime/neural_speed/tests/test_neural_speed.py +++ b/intel_extension_for_transformers/llm/runtime/neural_speed/tests/test_neural_speed.py @@ -21,7 +21,11 @@ import unittest from transformers import AutoTokenizer, TextStreamer -from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig, AutoModelForCausalLM +from intel_extension_for_transformers.transformers import ( + AutoModel, + RtnConfig, + AutoModelForCausalLM +) from neural_speed.convert import convert_model from neural_speed import Model @@ -55,7 +59,7 @@ def test_llm_runtime(self): print(tokenizer.decode(pt_generate_ids)) # check output ids - woq_config = WeightOnlyQuantConfig(use_quant=False) + woq_config = RtnConfig(use_quant=False) itrex_model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_neural_speed=True, trust_remote_code=True) itrex_generate_ids = itrex_model.generate(inputs.input_ids, do_sample=False, max_new_tokens=100)[0] print(tokenizer.decode(itrex_generate_ids)) @@ -64,10 +68,10 @@ def test_llm_runtime(self): # check diff of logits woq_configs = { - "fp32": WeightOnlyQuantConfig(use_quant=False), - # "ggml_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4",use_ggml=True), - "jblas_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4"), - # "jblas_int8": WeightOnlyQuantConfig(compute_dtype="bf16", 
weight_dtype="int8"), + "fp32": RtnConfig(use_quant=False), + # "ggml_int4": RtnConfig(compute_dtype="int8", weight_dtype="int4",use_ggml=True), + "jblas_int4": RtnConfig(bits=8, compute_dtype="int8", weight_dtype="int4"), + # "jblas_int8": RtnConfig(compute_dtype="bf16", weight_dtype="int8"), } for config_type in woq_configs: itrex_model = AutoModel.from_pretrained(model_name, quantization_config=woq_configs[config_type], @@ -114,7 +118,7 @@ def test_beam_search(self): pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/beam_pt_generate_ids.pth").tolist() # llm runtime fp32 - woq_config = WeightOnlyQuantConfig(use_quant=False) + woq_config = RtnConfig(use_quant=False) itrex_model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=woq_config, trust_remote_code=True) itrex_generate_ids = itrex_model.generate( diff --git a/intel_extension_for_transformers/neural_chat/chatbot.py b/intel_extension_for_transformers/neural_chat/chatbot.py index 89d0dc35333..a9c59d83c96 100644 --- a/intel_extension_for_transformers/neural_chat/chatbot.py +++ b/intel_extension_for_transformers/neural_chat/chatbot.py @@ -364,12 +364,16 @@ def optimize_model(model, config, use_neural_speed=False): logging.error(f"Exception: {e}") from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, BitsAndBytesConfig ) if type(config) == MixedPrecisionConfig: set_latest_error(ErrorCodes.ERROR_AMP_OPTIMIZATION_FAIL) - elif type(config) == WeightOnlyQuantConfig: + elif type(config) in [RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig]: set_latest_error(ErrorCodes.ERROR_WEIGHT_ONLY_QUANT_OPTIMIZATION_FAIL) elif type(config) == BitsAndBytesConfig: set_latest_error(ErrorCodes.ERROR_BITS_AND_BYTES_OPTIMIZATION_FAIL) diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index a3f0425c22e..7e38b4d6897 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -493,13 +493,19 @@ def __init__(self, use_hpu_graphs = True if self.device == "hpu" else False) from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, - BitsAndBytesConfig + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, + BitsAndBytesConfig, ) self.optimization_config = optimization_config if optimization_config is not None else \ MixedPrecisionConfig(dtype="float16" if self.device == "cuda" else "bfloat16") - assert type(self.optimization_config) in [MixedPrecisionConfig, WeightOnlyQuantConfig, BitsAndBytesConfig], \ - f"Expect optimization_config be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" + \ - " or BitsAndBytesConfig,got {type(self.optimization_config)}." + assert type(self.optimization_config) in \ + [MixedPrecisionConfig, RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig, BitsAndBytesConfig], \ + f"Expect optimization_config be an object of MixedPrecisionConfig, RtnConfig, AwqConfig, TeqConfig, " + \ + "GPTQConfig, AutoRoundConfig" + \ + " or BitsAndBytesConfig, got {type(self.optimization_config)}." 
self.assistant_model = assistant_model self.serving_config = serving_config diff --git a/intel_extension_for_transformers/neural_chat/docs/advanced_features.md b/intel_extension_for_transformers/neural_chat/docs/advanced_features.md index 4638503dacc..a04ab0a02f1 100644 --- a/intel_extension_for_transformers/neural_chat/docs/advanced_features.md +++ b/intel_extension_for_transformers/neural_chat/docs/advanced_features.md @@ -141,9 +141,10 @@ Compared to normal quantization like W8A8, weight only quantization is probably ```python # Python code from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig +from intel_extension_for_transformers.transformers import RtnConfig loading_config = LoadingModelConfig(use_neural_speed=True) config = PipelineConfig( - optimization_config=WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4_fullrange") + optimization_config=RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4_fullrange") ) chatbot = build_chatbot(config) response = chatbot.predict("Tell me about Intel Xeon Scalable Processors.") @@ -156,9 +157,10 @@ response = chatbot.predict("Tell me about Intel Xeon Scalable Processors.") # Python code from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig +from intel_extension_for_transformers.transformers import RtnConfig loading_config = LoadingModelConfig(use_neural_speed=True) config = PipelineConfig( - optimization_config=WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4"), + optimization_config=RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4"), loading_config=loading_config ) chatbot = build_chatbot(config) diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_talkingbot_on_pc.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_talkingbot_on_pc.ipynb index be4cbc601c6..fcc951f9247 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_talkingbot_on_pc.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_talkingbot_on_pc.ipynb @@ -126,9 +126,9 @@ "outputs": [], "source": [ "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n", - "from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig\n", + "from intel_extension_for_transformers.transformers import RtnConfig\n", "from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4\"), \n", + "config = PipelineConfig(optimization_config=RtnConfig(bits=4, compute_dtype=\"int8\", weight_dtype=\"int4\"), \n", " loading_config=LoadingModelConfig(use_neural_speed=True),\n", " model_name_or_path=\"meta-llama/Llama-2-7b-chat-hf\")\n", "chatbot = build_chatbot(config)\n", diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/itrex_llm_graph_int4_optimization_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/itrex_llm_graph_int4_optimization_on_spr.ipynb index 26088861134..c51b65b46cf 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/itrex_llm_graph_int4_optimization_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/itrex_llm_graph_int4_optimization_on_spr.ipynb @@ -104,9 +104,9 @@ "outputs": [], "source": [ "from 
intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n", - "from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig\n", + "from intel_extension_for_transformers.transformers import RtnConfig\n", "from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4\"), \n", + "config = PipelineConfig(optimization_config=RtnConfig(bits=4, compute_dtype=\"int8\", weight_dtype=\"int4\"), \n", " loading_config=LoadingModelConfig(use_neural_speed=True),\n", " model_name_or_path='meta-llama/Llama-2-7b-chat-hf')\n", "chatbot = build_chatbot(config)\n", diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/transformers_extension_api.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/transformers_extension_api.ipynb index 939ea2dc518..0a077bc390b 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/transformers_extension_api.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/transformers_extension_api.ipynb @@ -96,11 +96,11 @@ "outputs": [], "source": [ "from transformers import AutoTokenizer, TextStreamer\n", - "from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig\n", + "from intel_extension_for_transformers.transformers import AutoModelForCausalLM, GPTQConfig\n", "\n", "# Download Hugging Face GPTQ model to local path\n", "model_name = \"PATH_TO_MODEL\" # local path to model\n", - "woq_config = WeightOnlyQuantConfig(use_gptq=True)\n", + "woq_config = GPTQConfig(bits=4)\n", "prompt = \"Once upon a time, a little girl\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/workshop/01_quickstart_neuralchat.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/workshop/01_quickstart_neuralchat.ipynb index a163e728502..50064c4a45a 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/workshop/01_quickstart_neuralchat.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/workshop/01_quickstart_neuralchat.ipynb @@ -155,10 +155,10 @@ "source": [ "# Build chatbot with INT4 weight-only quantization, computations in AMX INT8\n", "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n", - "from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig\n", + "from intel_extension_for_transformers.transformers import RtnConfig\n", "from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig\n", "config = PipelineConfig(model_name_or_path=\"Intel/neural-chat-7b-v3-1\",\n", - " optimization_config=WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4_fullrange\"), \n", + " optimization_config=RtnConfig(bits=4, compute_dtype=\"int8\", weight_dtype=\"int4_fullrange\"), \n", " loading_config=LoadingModelConfig(use_neural_speed=False))\n", "chatbot = build_chatbot(config)\n", "\n", diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb index ccc75c67941..4336028c99d 100644 --- a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb +++ 
b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/pc/build_talkingbot_on_pc.ipynb @@ -132,12 +132,12 @@ "metadata": {}, "outputs": [], "source": [ - "from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig\n", + "from intel_extension_for_transformers.transformers import RtnConfig\n", "from transformers import AutoTokenizer, TextStreamer\n", "from intel_extension_for_transformers.transformers import AutoModel\n", "\n", "model_name = \"meta-llama/Llama-2-7b-chat-hf\" # Please first download the model and replace this model_name with the local path\n", - "woq_config = WeightOnlyQuantConfig(compute_type=\"int8\", weight_dtype=\"int4\")\n", + "woq_config = RtnConfig(bits=4, compute_type=\"int8\", weight_dtype=\"int4\")\n", "prompt = \"Who is andy grove\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n", diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py index 88e3f3eba06..fbd3304e46a 100644 --- a/intel_extension_for_transformers/neural_chat/models/model_utils.py +++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py @@ -52,7 +52,11 @@ from transformers.utils import is_bitsandbytes_available, is_offline_mode from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, BitsAndBytesConfig ) from intel_extension_for_transformers.neural_chat.errorcode import ErrorCodes @@ -560,12 +564,14 @@ def load_model( load_to_meta = model_on_meta(config) - if isinstance(optimization_config, WeightOnlyQuantConfig): + if isinstance(optimization_config, (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig)): from intel_extension_for_transformers.neural_chat.chatbot import optimize_model if use_neural_speed: optimization_config.post_init_runtime() - else: - optimization_config.post_init() + elif device == "cpu": + optimization_config.post_init_cpu() + elif device == "xpu": + optimization_config.post_init_xpu() model = optimize_model(model_name, optimization_config, use_neural_speed) if hasattr(model, 'config'): if model.config.is_encoder_decoder: diff --git a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py index f8cedbf1155..664f032554e 100644 --- a/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py +++ b/intel_extension_for_transformers/neural_chat/server/neuralchat_server.py @@ -248,16 +248,17 @@ def init(self, config): loading_config = LoadingModelConfig(ipex_int8=ipex_int8, use_neural_speed=use_neural_speed, peft_path=peft_model_path, use_deepspeed=use_deepspeed, world_size=world_size, gguf_model_path=gguf_model_path) - from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig + from intel_extension_for_transformers.transformers import RtnConfig, AwqConfig, TeqConfig, GPTQConfig, \ + AutoRoundConfig, MixedPrecisionConfig if optimization_type == "weight_only": if use_gptq: - optimization_config = WeightOnlyQuantConfig(use_gptq=use_gptq) + optimization_config = GPTQConfig(bits=4) elif use_awq: - optimization_config = WeightOnlyQuantConfig(use_gptq=use_awq) + optimization_config = AwqConfig(bits=4) elif use_autoround: - optimization_config = WeightOnlyQuantConfig(use_gptq=use_autoround) + optimization_config = 
AutoRoundConfig(bits=4) else: - optimization_config = WeightOnlyQuantConfig(compute_dtype=compute_dtype, weight_dtype=weight_dtype, + optimization_config = RtnConfig(bits=4, compute_dtype=compute_dtype, weight_dtype=weight_dtype, use_ggml=use_ggml, use_cache=use_cached_bin) elif optimization_type == "mix_precision": optimization_config = MixedPrecisionConfig(dtype=mix_precision_dtype) diff --git a/intel_extension_for_transformers/neural_chat/tests/ci/api/test_chatbot_exception.py b/intel_extension_for_transformers/neural_chat/tests/ci/api/test_chatbot_exception.py index fcadecfccce..866a747338d 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/api/test_chatbot_exception.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/api/test_chatbot_exception.py @@ -28,7 +28,7 @@ TextGenerationFinetuningConfig, ) from intel_extension_for_transformers.neural_chat.chatbot import finetune_model, optimize_model -from intel_extension_for_transformers.transformers import MixedPrecisionConfig, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import MixedPrecisionConfig, RtnConfig from intel_extension_for_transformers.transformers import BitsAndBytesConfig from intel_extension_for_transformers.neural_chat.errorcode import ErrorCodes from intel_extension_for_transformers.neural_chat.utils.error_utils import get_latest_error @@ -502,7 +502,7 @@ def test_amp_optimize_fail(self,mock_optimize): @unittest.skipIf(get_device_type() != 'cpu', "Only run this test on CPU") @patch('intel_extension_for_transformers.llm.quantization.optimization.Optimization.optimize') def test_weight_only_quant_optimize_fail(self,mock_optimize): - config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") + config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4") model = AutoModelForCausalLM.from_pretrained( "facebook/opt-125m", low_cpu_mem_usage=True, diff --git a/intel_extension_for_transformers/neural_chat/tests/ci/models/test_model_utils.py b/intel_extension_for_transformers/neural_chat/tests/ci/models/test_model_utils.py index 02c4c64ebf8..f74eee3864a 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/models/test_model_utils.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/models/test_model_utils.py @@ -20,7 +20,7 @@ import shutil from unittest import mock from intel_extension_for_transformers.neural_chat.models.model_utils import load_model, MODELS, predict -from intel_extension_for_transformers.transformers import MixedPrecisionConfig, BitsAndBytesConfig, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import MixedPrecisionConfig, BitsAndBytesConfig, RtnConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type from intel_extension_for_transformers.neural_chat.utils.error_utils import clear_latest_error, get_latest_error from intel_extension_for_transformers.neural_chat.errorcode import ErrorCodes @@ -127,14 +127,14 @@ def test_model_optimization_bitsandbytes(self): @unittest.skipIf(get_device_type() != 'cpu', "Only run this test on CPU") def test_model_optimization_weightonly_llmruntime(self): - config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") + config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4") load_model(model_name="facebook/opt-125m", tokenizer_name="facebook/opt-125m", device="cpu", optimization_config=config, use_neural_speed=True) self.assertTrue("facebook/opt-125m" in MODELS) 
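The `neuralchat_server.py` hunk a little earlier maps the server's weight-only options onto the new config classes (and, in passing, fixes the old bug where `use_awq`/`use_autoround` were passed as `use_gptq`). Condensed into a standalone helper for illustration only; the flag and dtype names are placeholders mirroring the server options, not a library API:

```python
from intel_extension_for_transformers.transformers import (
    AutoRoundConfig,
    AwqConfig,
    GPTQConfig,
    RtnConfig,
)

def pick_woq_config(use_gptq=False, use_awq=False, use_autoround=False,
                    compute_dtype="int8", weight_dtype="int4"):
    # One config class per algorithm; RTN is the fallback for plain
    # compute/weight dtype settings.
    if use_gptq:
        return GPTQConfig(bits=4)
    if use_awq:
        return AwqConfig(bits=4)
    if use_autoround:
        return AutoRoundConfig(bits=4)
    return RtnConfig(bits=4, compute_dtype=compute_dtype, weight_dtype=weight_dtype)

woq_config = pick_woq_config(use_gptq=True)
```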
self.assertTrue(MODELS["facebook/opt-125m"]["model"] is not None) @unittest.skipIf(get_device_type() != 'cpu', "Only run this test on CPU") def test_model_optimization_weightonly(self): - config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4_fullrange") + config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4_fullrange") load_model(model_name="facebook/opt-125m", tokenizer_name="facebook/opt-125m", device="cpu", optimization_config=config) self.assertTrue("facebook/opt-125m" in MODELS) self.assertTrue(MODELS["facebook/opt-125m"]["model"] is not None) diff --git a/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization.py b/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization.py index bb90841d71d..f2949a2a3bd 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization.py @@ -23,7 +23,7 @@ from intel_extension_for_transformers.neural_chat import build_chatbot from intel_extension_for_transformers.neural_chat.config import PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig, MixedPrecisionConfig +from intel_extension_for_transformers.transformers import RtnConfig, MixedPrecisionConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type class TestChatbotBuilder(unittest.TestCase): @@ -80,7 +80,7 @@ def test_build_chatbot_with_weight_only_quant(self): if self.device == "cpu": loading_config = LoadingModelConfig(use_neural_speed=False) config = PipelineConfig(model_name_or_path="facebook/opt-125m", - optimization_config=WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="int4_fullrange"), + optimization_config=RtnConfig(bits=4, compute_dtype="fp32", weight_dtype="int4_fullrange"), loading_config=loading_config ) chatbot = build_chatbot(config) diff --git a/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization_llmruntime.py b/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization_llmruntime.py index 82c63bb7c55..c27e0e5a995 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization_llmruntime.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/optimization/test_optimization_llmruntime.py @@ -21,7 +21,7 @@ from intel_extension_for_transformers.neural_chat import build_chatbot from intel_extension_for_transformers.neural_chat.config import PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import RtnConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type class TestChatbotBuilder(unittest.TestCase): @@ -45,7 +45,7 @@ def tearDown(self) -> None: def test_build_chatbot_with_llm_runtime(self): loading_config = LoadingModelConfig(use_neural_speed=True) config = PipelineConfig(model_name_or_path="facebook/opt-125m", - optimization_config=WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int8"), + optimization_config=RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int8"), loading_config=loading_config ) chatbot = build_chatbot(config) diff --git 
a/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_int4_textchat_server.py b/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_int4_textchat_server.py index dbf3c259b59..27416aade84 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_int4_textchat_server.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_int4_textchat_server.py @@ -22,7 +22,7 @@ from intel_extension_for_transformers.neural_chat import build_chatbot from intel_extension_for_transformers.neural_chat import PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import RtnConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type from intel_extension_for_transformers.neural_chat.server.restful.textchat_api import router from intel_extension_for_transformers.neural_chat.server.restful.openai_protocol import ChatCompletionRequest, ChatCompletionResponse @@ -37,7 +37,7 @@ def setUp(self) -> None: if device != "cpu": self.skipTest("Only test this UT case on Intel CPU.") loading_config = LoadingModelConfig(use_neural_speed=False) - optimization_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4_fullrange") + optimization_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4_fullrange") config = PipelineConfig(model_name_or_path="facebook/opt-125m", device="cpu", loading_config=loading_config, optimization_config=optimization_config) diff --git a/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_llm_runtime_int4_server.py b/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_llm_runtime_int4_server.py index 4e95ef7d886..da8d29c9cbd 100644 --- a/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_llm_runtime_int4_server.py +++ b/intel_extension_for_transformers/neural_chat/tests/ci/server/test_itrex_llm_runtime_int4_server.py @@ -22,7 +22,7 @@ from intel_extension_for_transformers.neural_chat import build_chatbot from intel_extension_for_transformers.neural_chat import PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import RtnConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type from intel_extension_for_transformers.neural_chat.server.restful.textchat_api import router from intel_extension_for_transformers.neural_chat.server.restful.openai_protocol import ChatCompletionRequest @@ -37,7 +37,7 @@ def setUp(self) -> None: if device != "cpu": self.skipTest("Only test this UT case on Intel CPU.") loading_config = LoadingModelConfig(use_neural_speed=True) - optimization_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4") + optimization_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4") config = PipelineConfig(model_name_or_path="facebook/opt-125m", device="cpu", loading_config=loading_config, optimization_config=optimization_config) diff --git a/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_gptq.py b/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_gptq.py index 543a43e19d3..aaf51fb0ac2 100644 --- 
a/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_gptq.py +++ b/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_gptq.py @@ -16,7 +16,7 @@ from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig from intel_extension_for_transformers.neural_chat.config import LoadingModelConfig -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import GPTQConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type import unittest @@ -32,7 +32,7 @@ def test_code_gen_with_gguf(self): if self.device == "hpu": self.skipTest("GTPQ is not supported on HPU.") loading_config = LoadingModelConfig(use_neural_speed=True) - optimization_config = WeightOnlyQuantConfig(use_gptq=True) + optimization_config = GPTQConfig(bits=4) config = PipelineConfig(model_name_or_path="/tf_dataset2/models/nlp_toolkit/Llama-2-7B-Chat-GPTQ", optimization_config=optimization_config, loading_config=loading_config) diff --git a/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_model_utils.py b/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_model_utils.py index b5f8ce9a8d1..d38d5dfbc5e 100644 --- a/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_model_utils.py +++ b/intel_extension_for_transformers/neural_chat/tests/nightly/models/test_model_utils.py @@ -22,7 +22,7 @@ import shutil from unittest import mock from intel_extension_for_transformers.neural_chat.models.model_utils import load_model, MODELS -from intel_extension_for_transformers.transformers import MixedPrecisionConfig, BitsAndBytesConfig, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import MixedPrecisionConfig, BitsAndBytesConfig from intel_extension_for_transformers.neural_chat.utils.common import get_device_type class TestModelUtils(unittest.TestCase): diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 0202e1bdebb..9bb247f7dd0 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -44,7 +44,11 @@ MixedPrecisionConfig, BitsAndBytesConfig, SmoothQuantConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, metrics, objectives, ) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 9d60e639943..395e0ec2d50 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -31,6 +31,7 @@ # limitations under the License. 
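The `transformers/__init__.py` hunk above removes `WeightOnlyQuantConfig` from the public API in favour of five per-algorithm classes. A migration sketch for downstream code (the commented alternatives are illustrative, taken from usages elsewhere in this patch):

```python
# Before this PR:
#   from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
#   woq_config = WeightOnlyQuantConfig(use_gptq=True)
#
# After this PR:
from intel_extension_for_transformers.transformers import (
    AutoRoundConfig,
    AwqConfig,
    GPTQConfig,
    RtnConfig,
    TeqConfig,
)

woq_config = GPTQConfig(bits=4)            # pre-quantized GPTQ checkpoints
# woq_config = AwqConfig(bits=4)           # pre-quantized AWQ checkpoints
# woq_config = AutoRoundConfig(bits=4)     # pre-quantized AutoRound checkpoints
# woq_config = RtnConfig(compute_dtype="int8", weight_dtype="int4")  # on-the-fly RTN
```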
import json +import copy import os import re import torch @@ -41,7 +42,11 @@ BitsAndBytesConfig, MixedPrecisionConfig, SmoothQuantConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, logger, LazyImport, ) @@ -63,6 +68,7 @@ replace_linear ) from transformers.configuration_utils import PretrainedConfig +from transformers import AutoConfig from transformers.utils import is_accelerate_available, is_bitsandbytes_available from typing import Union @@ -85,6 +91,7 @@ def convert_model_to_public(model): def save_low_bit( self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs ): + assert hasattr( self, "quantization_config" ), f"Detected this model is not a low-bit model." @@ -141,8 +148,6 @@ def save_low_bit( commit_message=commit_message, token=kwargs.get("token"), ) - - self.quantization_config.low_bit_model = True self.quantization_config.save_pretrained(save_directory, **kwargs) @@ -155,6 +160,7 @@ class _BaseQBitsAutoModelClass: @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + # use for neuralspeed gguf model_file = kwargs.pop("model_file", None) if model_file is not None: from neural_speed import Model @@ -200,46 +206,31 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): device_map = kwargs.get("device_map", "cpu") use_cpu = (True if device_map == torch.device("cpu") or device_map == "cpu" else False) use_xpu = (True if device_map == torch.device("xpu") or device_map == "xpu" else False) - use_neural_speed = False - if not use_xpu: - if kwargs.get("use_llm_runtime", None) is not None: - use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu - logger.warning("use_llm_runtime is deprecated in version 1.3.2, please use_neural_speed instead.") - elif kwargs.get("use_neural_speed", None) is not None: - use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu - else: - config = transformers.AutoConfig.from_pretrained(pretrained_model_name_or_path, - trust_remote_code=kwargs.get("trust_remote_code", False)) - if hasattr(config, "model_type") == False: - logger.error("Can't get the model_type. Please check the correct model_type") - exit(0) - if config.model_type in cls.model_type_list: - logger.info("Using Neural Speed...") - use_neural_speed = True + config = kwargs.pop("config", None) - if os.path.isfile(os.path.join(pretrained_model_name_or_path, QUANT_CONFIG)): - logger.info( - "Find quantization_config.json, trying to load quantized low bit model..." - ) - quantization_config = WeightOnlyQuantConfig.from_pretrained( + if not isinstance(config, PretrainedConfig): + config, _ = AutoConfig.from_pretrained( pretrained_model_name_or_path, - _configuration_file=QUANT_CONFIG, + return_unused_kwargs=True, **kwargs, + ) - if quantization_config is None or quantization_config.low_bit_model != True: + if hasattr(config, "quantization_config"): + if config.quantization_config is None: logger.warning("Quantization_config loading failed. 
If you want to load saved " - "low bit model, please check your quantization_config.json.") + "low bit model, please check your quantizate_config.json.") else: logger.info( "quantization_config: {}".format( - quantization_config.to_json_string() + config.quantization_config ) ) try: kwargs["device_map"] = \ - quantization_config.device if hasattr(quantization_config, "device") else "auto" - model = cls.load_low_bit(pretrained_model_name_or_path, *model_args, **kwargs) + config.quantization_config["device"] if "device" in config.quantization_config.keys() \ + else "auto" + model = cls.load_low_bit(pretrained_model_name_or_path, *model_args, config=config, **kwargs) logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.") return model @@ -247,7 +238,22 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): logger.error(e) logger.error("Saved low bit model loading failed, please check your model.") exit(0) + if kwargs.get("use_llm_runtime", None) is not None: + use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu + logger.warning("use_llm_runtime is deprecated in version 1.3.2, please use_neural_speed instead.") + elif kwargs.get("use_neural_speed", None) is not None: + use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu + else: + if hasattr(config, "model_type") == False: + logger.error("Can't get the model_type. Please check the correct model_type") + exit(0) + if config.model_type in cls.model_type_list and not use_xpu: + logger.info("Using Neural Speed...") + use_neural_speed = True + else: + logger.info("Using Pytorch...") + use_neural_speed = False import intel_extension_for_transformers.transformers.modeling.modeling_map @@ -258,8 +264,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if isinstance(quantization_config, BitsAndBytesConfig): model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, - quantization_config=quantization_config, *model_args, + config=config, + quantization_config=quantization_config, **kwargs, ) return model @@ -267,10 +274,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if (is_accelerate_available() and is_bitsandbytes_available() and not use_cpu and not use_xpu): model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, + *model_args, + config=config, quantization_config=quantization_config, load_in_4bit=load_in_4bit, load_in_8bit=load_in_8bit, - *model_args, **kwargs, ) logger.info("WeightOnlyQuant bitsandbytes done.") @@ -286,9 +294,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if quantization_config is None: if use_neural_speed: # use wnf4_sfp32_cfp32_g32_sym by default - quantization_config = WeightOnlyQuantConfig(compute_dtype="fp32", weight_dtype="nf4") + quantization_config = RtnConfig(compute_dtype="fp32", weight_dtype="nf4") else: - quantization_config = WeightOnlyQuantConfig(compute_dtype=convert_dtype_torch2str(torch_dtype), + quantization_config = RtnConfig(bits=4, compute_dtype=convert_dtype_torch2str(torch_dtype), weight_dtype="nf4" if use_cpu else "int4_fullrange") else: assert ("4" in quantization_config.weight_dtype @@ -298,9 +306,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif load_in_8bit: if quantization_config is None: if use_neural_speed: - quantization_config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8") + quantization_config = 
RtnConfig(compute_dtype="bf16", weight_dtype="int8") else: - quantization_config = WeightOnlyQuantConfig(compute_dtype=convert_dtype_torch2str(torch_dtype), + quantization_config = RtnConfig(bits=8, compute_dtype=convert_dtype_torch2str(torch_dtype), weight_dtype="int8") else: assert ( @@ -321,19 +329,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "will fall to traditional load method with higher memory consumption." ) kwargs["low_cpu_mem_usage"] = False - model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, + *model_args, + config=config, + **kwargs) model.config.update({"low_cpu_mem_usage": False}) model = model.to("cpu") model.config.update({"device": "cpu"}) model.eval() logger.info("Mixed Precision done.") - elif isinstance(quantization_config, WeightOnlyQuantConfig): + elif isinstance(quantization_config, (RtnConfig, AwqConfig, TeqConfig, GPTQConfig, AutoRoundConfig)): logger.info("Applying Weight Only Quantization.") if use_neural_speed: logger.info("Using LLM runtime.") quantization_config.post_init_runtime() from neural_speed import Model - model = Model() model.init( pretrained_model_name_or_path, @@ -343,11 +353,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): scale_dtype=quantization_config.scale_dtype, compute_dtype=quantization_config.compute_dtype, use_ggml=quantization_config.use_ggml, - use_quant=quantization_config.use_quant, - use_gptq=quantization_config.use_gptq or \ - quantization_config.algorithm.upper() == "GPTQ" or \ - quantization_config.use_autoround, - use_awq=quantization_config.algorithm.upper() == "AWQ", + use_quant=quantization_config.use_quant if hasattr(quantization_config, "use_quant") else False, + use_gptq=quantization_config.quant_method.value == "gptq" or \ + quantization_config.quant_method.value =="autoround", + use_awq=quantization_config.quant_method.value == "awq", ) model.quantization_config = quantization_config return model @@ -355,13 +364,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if use_xpu: # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. kwargs["low_cpu_mem_usage"] = True - kwargs["device_map"] = "auto" + kwargs["device_map"] = "cpu" try: model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, - torchscript=True - if quantization_config.algorithm in ["TEQ", "AWQ"] and not use_xpu else False, *model_args, + config=config, **kwargs, ) model.config.update({"low_cpu_mem_usage": True}) @@ -371,33 +379,35 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): kwargs["low_cpu_mem_usage"] = False model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, - torchscript=True - if quantization_config.algorithm in ["TEQ", "AWQ"] and not use_xpu else False, *model_args, + config=config, **kwargs, ) model.config.update({"low_cpu_mem_usage": False}) else: model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, - torchscript=True if quantization_config.algorithm in ["TEQ", "AWQ"] and not use_xpu else False, *model_args, + config=config, **kwargs, ) model.eval() - quantization_config.update({"device": "cpu"}) + + quantization_config.update(**{"device": "cpu"}) if use_xpu: import intel_extension_for_pytorch assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" 
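As the `model_utils.py` hunk earlier and the hunk just below show, the single `post_init()` call is split into device-specific variants in this patch. A minimal calling-pattern sketch (the device string and config values are chosen for illustration only):

```python
from intel_extension_for_transformers.transformers import RtnConfig

quantization_config = RtnConfig(bits=4, compute_dtype="int8", weight_dtype="int4")
device_map = "cpu"  # or "xpu"

# Device-specific validation replaces the former post_init().
if device_map == "cpu":
    quantization_config.post_init_cpu()
elif device_map == "xpu":
    quantization_config.post_init_xpu()
```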
- quantization_config.update({"device": "xpu"}) + quantization_config.update(**{"device": "xpu"}) if (not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu")) and model.config.model_type == "chatglm": model = model.float() if use_cpu: - quantization_config.post_init() + quantization_config.post_init_cpu() elif use_xpu: quantization_config.post_init_xpu() model = convert_to_quantized_model(model, quantization_config, device=device_map) + quantization_config.tokenizer = None + model.config.quantization_config = quantization_config # add quantization_config and save_low_bit to pretrained model dynamically model.device_map = device_map @@ -411,13 +421,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): except ImportError: logger.warning("Please install Intel Extension for PyTorch to accelerate the model inference.") assert (ipex.__version__ >= "2.2.0+cpu"), "Please use Intel Extension for PyTorch >=2.2.0+cpu." + + config.torchscript = True + config.use_cache = True model = cls.ORIG_MODEL.from_pretrained( pretrained_model_name_or_path, + *model_args, + config=config, low_cpu_mem_usage=True, torch_dtype=torch.float, - torchscript=True, - use_cache=True, - *model_args, **kwargs, ) @@ -646,7 +658,9 @@ def calib_func(model): ) logger.info("SmoothQuant done.") else: - model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + model = cls.ORIG_MODEL.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **kwargs + ) if (not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu")) and model.config.model_type == "chatglm": model = model.float() @@ -688,7 +702,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # Autofactory kwargs_orig = copy.deepcopy(kwargs) # modules_to_not_convert = kwargs.pop("modules_to_not_convert", None) - trust_remote_code = kwargs.get("trust_remote_code", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) # Maybe needed when extract_local_archive_file subfolder = kwargs.get("subfolder", "") variant = kwargs.get("variant", None) @@ -719,20 +733,20 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): # if torch_dtype=auto was passed here, ensure to pass it on if kwargs_orig.get("torch_dtype", None) == "auto": kwargs["torch_dtype"] = "auto" - - quantization_config = WeightOnlyQuantConfig.from_pretrained( - pretrained_model_name_or_path, - _configuration_file=QUANT_CONFIG, - **kwargs, - ) + config = kwargs.pop("config", None) + quantization_config = config.quantization_config + if quantization_config["quant_method"] == "rtn": + quantization_config = RtnConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "awq": + quantization_config = AwqConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "teq": + quantization_config = TeqConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "gptq": + quantization_config = GPTQConfig.from_dict(quantization_config) + elif quantization_config["quant_method"] == "autoround": + quantization_config = AutoRoundConfig.from_dict(quantization_config) assert (quantization_config is not None), "Detect this model is not a low-bit model." 
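`load_low_bit` now rebuilds the config object from the `quantization_config` dict stored with the saved model via each class's `from_dict`. The if/elif chain above is equivalent to a small lookup table; a sketch of that pattern (the helper and table names are hypothetical, not part of the patch):

```python
from intel_extension_for_transformers.transformers import (
    AutoRoundConfig,
    AwqConfig,
    GPTQConfig,
    RtnConfig,
    TeqConfig,
)

# Hypothetical lookup table keyed on the serialized quant_method string.
_CONFIG_BY_QUANT_METHOD = {
    "rtn": RtnConfig,
    "awq": AwqConfig,
    "teq": TeqConfig,
    "gptq": GPTQConfig,
    "autoround": AutoRoundConfig,
}

def config_from_dict(quantization_config: dict):
    # quantization_config is the dict saved in the model's config,
    # e.g. {"quant_method": "gptq", "bits": 4, ...}
    cls = _CONFIG_BY_QUANT_METHOD[quantization_config["quant_method"]]
    return cls.from_dict(quantization_config)
```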
- kwargs["trust_remote_code"] = trust_remote_code - config, kwargs = AutoConfig.from_pretrained( - pretrained_model_name_or_path, - return_unused_kwargs=True, - **kwargs, - ) if commit_hash is None: if not isinstance(config, PretrainedConfig): @@ -756,8 +770,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs): else: commit_hash = getattr(config, "_commit_hash", None) - config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path) - low_cpu_mem_usage = config_dict.pop("low_cpu_mem_usage", True) + low_cpu_mem_usage = (hasattr(config, "low_cpu_mem_usage") and config.low_cpu_mem_usage) has_remote_code = (hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map) diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index f17bfabf545..81e9f1d19bf 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -22,6 +22,10 @@ BitsAndBytesConfig, SmoothQuantConfig, SparsityConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig ) from .utility import LazyImport, logger, str2bool, CpuInfo diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py index 1fee8a9b325..4c958f0eb2c 100644 --- a/intel_extension_for_transformers/transformers/utils/config.py +++ b/intel_extension_for_transformers/transformers/utils/config.py @@ -20,272 +20,70 @@ import json import os from dataclasses import dataclass, field -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from .utility import QUANT_CONFIG, SPARSITY_CONFIG, LazyImport, logger from transformers import BitsAndBytesConfig, PretrainedConfig torch = LazyImport("torch") -class WeightOnlyQuantConfig(PretrainedConfig): +@dataclass +class MixedPrecisionConfig: + dtype: str = "bfloat16" + + +@dataclass +class SmoothQuantConfig: + backend: str = "ipex" + ipex_opt_llm: bool = None + tokenizer: Any = None + calib_func: Any = None + calib_dataset: str = "NeelNanda/pile-10k" + calib_shuffle: bool = True + calib_iters: int = 100 + calib_padding: bool = False + calib_len: int = 512 + calib_pad_val: int = 1 + alpha: float = 0.5 + op_type_dict: dict = None + op_name_dict: dict = None + excluded_precisions: list = field(default_factory=list) + example_inputs: Any = None + num_beams: int = 1 + recipes: dict = field( + default_factory=lambda: { + "smooth_quant": True, + "smooth_quant_args": {"alpha": 0.5}, + } + ) + +class SparsityConfig(PretrainedConfig): def __init__( self, - llm_int8_skip_modules=None, - compute_dtype=None, - weight_dtype=None, - scale_dtype=None, - mse_range=False, # only for RTN and AWQ - use_double_quant=False, - double_quant_scale_dtype=None, # reserve for double quant - group_size=32, - scheme="sym", - algorithm="RTN", - use_ggml=False, - use_quant=True, - use_gptq=False, - use_autoround=False, - algorithm_args=None, - use_neural_speed=True, - low_bit_model=False, + sparse_pattern: str = "1x1", + sparse_dtype=None, + sparse_layers=None, + dense_layers: list = ["lm_head"], + group_size=None, **kwargs, ): - from intel_extension_for_transformers.llm.quantization.utils import ( - convert_dtype_torch2str, ) - - self.llm_int8_skip_modules = (llm_int8_skip_modules if llm_int8_skip_modules else []) - self.weight_dtype = weight_dtype - 
self.mse_range = mse_range - self.use_double_quant = use_double_quant - self.scheme = scheme - self.algorithm = algorithm + self.sparse_pattern = sparse_pattern + self.sparse_dtype = sparse_dtype + self.sparse_layers = sparse_layers + self.dense_layers = dense_layers self.group_size = group_size - self.tokenizer = kwargs.pop("tokenizer", None) - self.calib_func = kwargs.pop("calib_func", None) - self.calib_dataset = kwargs.pop("calib_dataset", "NeelNanda/pile-10k") - self.calib_dataloader = kwargs.pop("calib_dataloader", None) - self.calib_iters = kwargs.pop("calib_iters", 100) - self.use_ggml = use_ggml - self.use_quant = use_quant - self.use_gptq = use_gptq - self.use_autoround = use_autoround - self.algorithm_args = algorithm_args - self.use_neural_speed = use_neural_speed - self.low_bit_model = low_bit_model - self.device = kwargs.get("device", "auto") - - if isinstance(compute_dtype, torch.dtype): - self.compute_dtype = convert_dtype_torch2str(compute_dtype) - else: - self.compute_dtype = compute_dtype - - if isinstance(scale_dtype, torch.dtype): - self.scale_dtype = convert_dtype_torch2str(scale_dtype) - else: - self.scale_dtype = scale_dtype - - if isinstance(double_quant_scale_dtype, torch.dtype): - self.double_quant_scale_dtype = convert_dtype_torch2str(double_quant_scale_dtype) - else: - self.double_quant_scale_dtype = double_quant_scale_dtype def post_init(self): r""" Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. """ - - if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") - - if self.compute_dtype is not None and self.compute_dtype not in ['fp32', 'bf16', 'int8']: - raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") - elif self.compute_dtype is None: - self.compute_dtype = "fp32" - - if self.weight_dtype is None: - self.weight_dtype = "nf4" - elif self.weight_dtype not in [ - "int8", - "int4_fullrange", - "int4_clip", - "nf4", - "fp4_e2m1_bnb", - "fp4_e2m1", - "fp8_e5m2", - "fp8_e4m3", - ]: - raise ValueError( - f"weight_dtype must be a string in " - f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'") - - if self.scale_dtype is not None and self.scale_dtype not in ["fp32", "fp8_e8m0"]: - raise ValueError(f"scale_dtype must be a string in 'fp32', 'fp8_e8m0' " - f"and fp8_e8m0 only used for weight_dtype 'fp8_e5m2', 'fp8_e4m3'") - elif self.scale_dtype is None: - self.scale_dtype = "fp32" - - if not isinstance(self.mse_range, bool): - raise ValueError("mse_range must be a boolean") - - if not isinstance(self.use_double_quant, bool): - raise ValueError("use_double_quant must be a boolean") - - if self.use_double_quant and not isinstance(self.double_quant_dtype, str): - raise ValueError("double_quant_dtype must be a string") - - if self.use_double_quant and not isinstance(self.scale_dtype, str): - raise ValueError("scale_dtype must be a string") - - if not isinstance(self.group_size, int): - raise ValueError("group_size must be a int") - - if not isinstance(self.scheme, str): - raise ValueError("scheme must be a string") - - if self.scheme == "asym" and (self.compute_dtype == "int8" or self.weight_dtype.startswith("fp") \ - or self.weight_dtype.startswith("nf") or self.scale_dtype != "fp32"): - raise ValueError("WeightOnlyQuantization doesn't support asym with \ - compute_dtype int8 or weight_dtype float or scale_dtype non-fp32 now, \ - please use 
sym scheme") - self.use_neural_speed = False - - def post_init_xpu(self): - r""" - Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. - """ - - if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") - - if self.compute_dtype is not None and self.compute_dtype not in ["fp16"]: - raise ValueError("compute_dtype must be 'fp16'.") - elif self.compute_dtype is None: - self.compute_dtype = "fp16" - - if self.algorithm not in ["RTN", "GPTQ"]: - raise ValueError("algorithm must be 'RTN' and 'GPTQ' now. will support 'TEQ', 'AWQ' soon!") - - if self.algorithm == "GPTQ": - if self.algorithm_args is not None: - if "actorder" in self.algorithm_args: - assert not self.algorithm_args["actorder"], "GPTQ algorithm only support actorder False now." - - if self.weight_dtype is None: - self.weight_dtype = "int4_fullrange" - elif self.weight_dtype not in [ - "int4_fullrange", - ]: - raise ValueError(f"weight_dtype must be a string in " - f"'int4_fullrange'.") - - if self.scale_dtype is not None and self.scale_dtype not in ["fp16"]: - raise ValueError(f"scale_dtype must be a string in 'fp16'") - elif self.scale_dtype is None: - self.scale_dtype = "fp16" - - if not isinstance(self.mse_range, bool): - raise ValueError("mse_range must be a boolean") - - if not isinstance(self.use_double_quant, bool): - raise ValueError("use_double_quant must be a boolean") - - if self.use_double_quant and not isinstance(self.double_quant_dtype, str): - raise ValueError("double_quant_dtype must be a string") - - if self.use_double_quant and not isinstance(self.scale_dtype, str): - raise ValueError("scale_dtype must be a string") - - if not isinstance(self.group_size, int): - raise ValueError("group_size must be a int") - - if self.scheme not in ["sym"]: - raise ValueError("scheme: {} is not support, only support 'sym' now!".format(self.scheme)) - self.use_neural_speed = False - - def post_init_runtime(self): - r""" - Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
- """ - - if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") - - # MX-compliant format - # https://arxiv.org/abs/2310.10537 - runtime_supported_compute_dtype = ["fp32", "fp16", "bf16", "int8"] - runtime_supported_weight_dtype = [ - "int4", - "int8", - "fp8", - "fp8_e5m2", - "fp8_e4m3", - "fp4", - "fp4_e2m1", - "nf4", - ] - runtime_supported_scale_dtype = ["fp32", "bf16", "fp8"] - runtime_supported_group_size = [-1, 32, 128] - runtime_supported_scheme = ["sym", "asym"] - - if self.compute_dtype is None: - self.compute_dtype = "fp32" - else: - if self.compute_dtype not in runtime_supported_compute_dtype: - raise ValueError("compute_dtype must be in {}.".format(runtime_supported_compute_dtype)) - - if self.weight_dtype is None: - self.weight_dtype = "int4" - elif self.weight_dtype == "fp8": - self.weight_dtype == "fp8_e4m3" - elif self.weight_dtype == "fp4": - self.weight_dtype = "fp4_e2m1" - else: - if self.weight_dtype not in runtime_supported_weight_dtype: - raise ValueError("weight_dtype must be in {}.".format(runtime_supported_weight_dtype)) - - if self.scale_dtype is None: - self.scale_dtype = "fp32" - else: - if self.scale_dtype not in runtime_supported_scale_dtype: - raise ValueError("scale_dtype must be in {}.".format(runtime_supported_scale_dtype)) - - if self.group_size not in runtime_supported_group_size: - raise ValueError("group_size must be an integer in {}.".format(runtime_supported_group_size)) - - if self.scheme not in runtime_supported_scheme: - raise ValueError("scheme must be in {}.".format(runtime_supported_scheme)) - - if self.weight_dtype[:3] in ["fp8", "fp4", "nf4"]: - if self.compute_dtype in ["int8"]: - print("WARNING: int8 compute dtype is not be supported in float quant types! "\ - "Fall back to fp32.") - self.compute_dtype = "fp32" - if self.scheme in ["asym"]: - print("WARNING: asym alg is not be supported in float quant types! "\ - "Fall back to sym.") - self.scheme = "sym" - if self.scale_dtype in ["fp8"] and self.weight_dtype[:3] not in ["fp8"]: - print("WARNING: fp8 scale is only be supported in fp8 weight type. "\ - "Fall back to fp32.") - self.scale_dtype = "fp32" - if self.weight_dtype[:3] == "fp8" and self.scale_dtype not in ["fp8", "fp32"]: - print("WARNING: fp8 weight type only supports fp8 / fp32 scale now."\ - " Fall back to fp8.") - self.scale_dtype = "fp8" - - self.use_neural_speed = True - - def quantization_method(self): - r""" - This method returns the quantization method used for the model. - """ - # TODO: For training only pass @classmethod def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): """ - Instantiates a [`WeightOnlyQuantConfig`] from a Python dictionary of parameters. + Instantiates a [`SparsityConfig`] from a Python dictionary of parameters. Args: config_dict (`Dict[str, Any]`): @@ -297,7 +95,7 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): Additional parameters from which to initialize the configuration object. Returns: - [`WeightOnlyQuantConfig`]: The configuration object instantiated from those parameters. + [`SparsityConfig`]: The configuration object instantiated from those parameters. 
""" config = cls(**config_dict) @@ -321,7 +119,9 @@ def from_json_file(cls, json_file_path, return_unused_kwargs, **kwargs): config_dict = json.load(f) return cls.from_dict(config_dict, return_unused_kwargs, **kwargs) - def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + def to_json_file( + self, json_file_path: Union[str, os.PathLike], use_diff: bool = True + ): """ Save this instance to a JSON file. @@ -329,8 +129,6 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = json_file_path (`str` or `os.PathLike`): Path to the JSON file in which this configuration instance's parameters will be saved. """ - # set tokenizer to None due to it doesn't support write to json - self.tokenizer = None with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string(use_diff=use_diff)) @@ -346,14 +144,6 @@ def to_dict(self) -> Dict[str, Any]: def __repr__(self): return f"{self.__class__.__name__} {self.to_json_string()}" - def rm_unspport_serial_items(self, config_dict): - unsupport_serial_items = ["calib_func", "calib_dataloader"] - for key in unsupport_serial_items: - if config_dict.get(key) is not None: - del config_dict[key] - - return config_dict - def to_json_string(self, use_diff: bool = True) -> str: """ Serializes this instance to a JSON string. @@ -361,7 +151,7 @@ def to_json_string(self, use_diff: bool = True) -> str: Args: use_diff (`bool`, *optional*, defaults to `True`): If set to `True`, only the difference between the config instance and the default - `WeightOnlyQuantConfig()` + `SparsityConfig()` is serialized to JSON string. Returns: @@ -372,7 +162,6 @@ def to_json_string(self, use_diff: bool = True) -> str: else: config_dict = self.to_dict() - config_dict = self.rm_unspport_serial_items(config_dict) return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" def to_diff_dict(self) -> Dict[str, Any]: @@ -386,7 +175,7 @@ def to_diff_dict(self) -> Dict[str, Any]: config_dict = self.to_dict() # get the default config dict - default_config_dict = WeightOnlyQuantConfig().to_dict() + default_config_dict = SparsityConfig().to_dict() serializable_config_dict = {} @@ -397,7 +186,12 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + **kwargs, + ): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~PretrainedConfig.from_pretrained`] class method. 
@@ -415,7 +209,9 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: self._set_token_in_kwargs(kwargs) if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) os.makedirs(save_directory, exist_ok=True) @@ -426,7 +222,7 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: files_timestamps = self._get_files_timestamps(save_directory) # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, QUANT_CONFIG) + output_config_file = os.path.join(save_directory, SPARSITY_CONFIG) self.to_json_file(output_config_file, use_diff=False) logger.info(f"Configuration saved in {output_config_file}") @@ -441,170 +237,328 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: ) @classmethod - def get_config_dict(cls, pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: - cf = kwargs.pop("_configuration_file", QUANT_CONFIG) - return super().get_config_dict(pretrained_model_name_or_path, _configuration_file=cf, **kwargs) + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + return super().get_config_dict( + pretrained_model_name_or_path, _configuration_file=SPARSITY_CONFIG, **kwargs + ) -@dataclass -class MixedPrecisionConfig: - dtype: str = "bfloat16" +from transformers.utils.quantization_config import QuantizationConfigMixin +from enum import Enum -@dataclass -class SmoothQuantConfig: - backend: str = "ipex" - ipex_opt_llm: bool = None - tokenizer: Any = None - calib_func: Any = None - calib_dataset: str = "NeelNanda/pile-10k" - calib_shuffle: bool = True - calib_iters: int = 100 - calib_padding: bool = False - calib_len: int = 512 - calib_pad_val: int = 1 - alpha: float = 0.5 - op_type_dict: dict = None - op_name_dict: dict = None - excluded_precisions: list = field(default_factory=list) - example_inputs: Any = None - num_beams: int = 1 - recipes: dict = field( - default_factory=lambda: { - "smooth_quant": True, - "smooth_quant_args": {"alpha": 0.5}, - } - ) +class QuantizationMethod(str, Enum): + BITS_AND_BYTES = "bitsandbytes" + GPTQ = "gptq" + AWQ = "awq" + AQLM = "aqlm" + RTN = "rtn" + AUTOROUND = "autoround" + TEQ = "teq" -class SparsityConfig(PretrainedConfig): - def __init__( - self, - sparse_pattern: str = "1x1", - sparse_dtype=None, - sparse_layers=None, - dense_layers: list = ["lm_head"], - group_size=None, - **kwargs, - ): - self.sparse_pattern = sparse_pattern - self.sparse_dtype = sparse_dtype - self.sparse_layers = sparse_layers - self.dense_layers = dense_layers - self.group_size = group_size +class ITREXQuantizationConfigMixin(QuantizationConfigMixin): + """ + Mixin class for quantization config + """ - def post_init(self): - r""" - Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. + def update(self, **kwargs): """ - pass - - @classmethod - def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): - """ - Instantiates a [`SparsityConfig`] from a Python dictionary of parameters. + Updates attributes of this class instance with attributes from `kwargs` if they match existing atributtes, + returning all the unused kwargs. 
Args: - config_dict (`Dict[str, Any]`): - Dictionary that will be used to instantiate the configuration object. - return_unused_kwargs (`bool`): - Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in - `PreTrainedModel`. kwargs (`Dict[str, Any]`): - Additional parameters from which to initialize the configuration object. + Dictionary of attributes to tentatively update this class. Returns: - [`WeightOnlyQuantConfig`]: The configuration object instantiated from those parameters. + `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. """ - - config = cls(**config_dict) - to_remove = [] for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) + if hasattr(self, key): + setattr(self, key, value) to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - if return_unused_kwargs: - return config, kwargs - else: - return config + # Remove all the attributes that were updated, without modifying the input dict + unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove} + return unused_kwargs - @classmethod - def from_json_file(cls, json_file_path, return_unused_kwargs, **kwargs): - with open(json_file_path, "r", encoding="utf-8") as f: - config_dict = json.load(f) - return cls.from_dict(config_dict, return_unused_kwargs, **kwargs) - - def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + def post_init_cpu(self): + r""" + Safety checker that arguments are correct """ - Save this instance to a JSON file. - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this configuration instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string(use_diff=use_diff)) + if self.compute_dtype is not None and self.compute_dtype not in [ + "fp32", + "bf16", + "int8", + ]: + raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") + elif self.compute_dtype is None: + self.compute_dtype = "fp32" - def to_dict(self) -> Dict[str, Any]: - """ - Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. - """ + if self.bits is None: + self.bits = 4 + elif self.bits is not None and self.bits not in [4, 8]: + raise ValueError( + f"Only support quantization to [4, 8] bits but found {self.bits}" + ) - output = copy.deepcopy(self.__dict__) - return output + if self.bits == 4 and self.weight_dtype not in [ + "int4_fullrange", + "int4_clip", + "nf4", + "fp4_e2m1_bnb", + "fp4_e2m1", + ]: + self.weight_dtype = "int4_clip" + logger.warning( + "int4_clip weight_type is used due to bits is 4 but weight_dtype is not set." + ) - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" + if self.bits == 8 and self.weight_dtype not in ["int8", "fp8_e5m2", "fp8_e4m3"]: + self.weight_dtype = "int8" + logger.warning( + "int8 weight_type is used due to bits is 8 but weight_dtype is not set." 
+ ) - def to_json_string(self, use_diff: bool = True) -> str: + elif self.weight_dtype not in [ + "int8", + "int4_fullrange", + "int4_clip", + "nf4", + "fp4_e2m1_bnb", + "fp4_e2m1", + "fp8_e5m2", + "fp8_e4m3", + ]: + raise ValueError( + f"weight_dtype must be a string in " + f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1', 'fp8_e5m2, fp8_e4m3'" + ) + + if self.scale_dtype is not None and self.scale_dtype not in [ + "fp32", + "fp8_e8m0", + ]: + raise ValueError( + f"scale_dtype must be a string in 'fp32', 'fp8_e8m0' " + f"and fp8_e8m0 only used for weight_dtype 'fp8_e5m2', 'fp8_e4m3'" + ) + elif self.scale_dtype is None: + self.scale_dtype = "fp32" + + if not isinstance(self.use_double_quant, bool): + raise ValueError("use_double_quant must be a boolean") + + if self.use_double_quant and not isinstance(self.double_quant_dtype, str): + raise ValueError("double_quant_dtype must be a string") + + if self.use_double_quant and not isinstance(self.scale_dtype, str): + raise ValueError("scale_dtype must be a string") + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if not isinstance(self.scheme, str): + raise ValueError("scheme must be a string") + + if self.scheme == "asym" and ( + self.compute_dtype == "int8" + or self.weight_dtype.startswith("fp") + or self.weight_dtype.startswith("nf") + or self.scale_dtype != "fp32" + ): + raise ValueError( + "WeightOnlyQuantization doesn't support asym with \ + compute_dtype int8 or weight_dtype float or scale_dtype non-fp32 now, \ + please use sym scheme" + ) + + self.use_neural_speed = False + + def post_init_xpu(self): + r""" + Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. """ - Serializes this instance to a JSON string. - Args: - use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default - `WeightOnlyQuantConfig()` - is serialized to JSON string. + if self.compute_dtype is not None and self.compute_dtype not in ["fp16"]: + raise ValueError("compute_dtype must be 'fp16'.") + elif self.compute_dtype is None: + self.compute_dtype = "fp16" - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. 
+ if self.bits is None: + self.bits = 4 + elif self.bits not in [4]: + raise ValueError( + f"Only support quantization to [4] bits but found {self.bits}" + ) + + if self.weight_dtype is None: + self.weight_dtype = "int4_fullrange" + + elif self.weight_dtype not in [ + "int4_fullrange", + ]: + raise ValueError(f"weight_dtype must be a string in 'int4_fullrange', but get {self.weight_dtype}.") + + if self.scale_dtype is not None and self.scale_dtype not in ["fp16"]: + raise ValueError(f"scale_dtype must be a string in 'fp16'") + elif self.scale_dtype is None: + self.scale_dtype = "fp16" + + if not isinstance(self.use_double_quant, bool): + raise ValueError("use_double_quant must be a boolean") + + if self.use_double_quant and not isinstance(self.double_quant_dtype, str): + raise ValueError("double_quant_dtype must be a string") + + if self.use_double_quant and not isinstance(self.scale_dtype, str): + raise ValueError("scale_dtype must be a string") + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if self.scheme not in ["sym"]: + raise ValueError( + "scheme: {} is not support, only support 'sym' now!".format(self.scheme) + ) + self.use_neural_speed = False + + def post_init_runtime(self): + r""" + Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. """ - if use_diff is True: - config_dict = self.to_diff_dict() + + # MX-compliant format + # https://arxiv.org/abs/2310.10537 + runtime_supported_compute_dtype = ["fp32", "fp16", "bf16", "int8"] + runtime_supported_weight_dtype = [ + "int4", + "int8", + "fp8", + "fp8_e5m2", + "fp8_e4m3", + "fp4", + "fp4_e2m1", + "nf4", + ] + runtime_supported_scale_dtype = ["fp32", "bf16", "fp8"] + runtime_supported_group_size = [-1, 32, 128] + runtime_supported_scheme = ["sym", "asym"] + + if self.compute_dtype is None: + self.compute_dtype = "fp32" else: - config_dict = self.to_dict() + if self.compute_dtype not in runtime_supported_compute_dtype: + raise ValueError( + "compute_dtype must be in {}.".format( + runtime_supported_compute_dtype + ) + ) + + if self.bits is None: + self.bits = 4 + elif self.bits not in [4, 8]: + raise ValueError( + f"Only support quantization to [4, 8] bits but found {self.bits}" + ) - return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + if self.weight_dtype is None: + self.weight_dtype = "int4" + elif self.weight_dtype == "fp8": + self.weight_dtype == "fp8_e4m3" + elif self.weight_dtype == "fp4": + self.weight_dtype = "fp4_e2m1" + else: + if self.weight_dtype not in runtime_supported_weight_dtype: + raise ValueError( + "weight_dtype must be in {}.".format(runtime_supported_weight_dtype) + ) - def to_diff_dict(self) -> Dict[str, Any]: - """ - Removes all attributes from config which correspond to the default config attributes for better readability and - serializes to a Python dictionary. + if self.bits == 4 and self.weight_dtype not in ["int4", "nf4", "fp4_e2m1"]: + self.weight_dtype = "int4" + print( + "int4 weight_type is used due to bits is 4 but weight_dtype is not set." + ) - Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() + if self.bits == 8 and self.weight_dtype not in ["int8", "fp8_e5m2", "fp8_e4m3"]: + self.weight_dtype = "int8" + print( + "int8 weight_type is used due to bits is 8 but weight_dtype is not set." 
+ ) - # get the default config dict - default_config_dict = SparsityConfig().to_dict() + if self.scale_dtype is None: + self.scale_dtype = "fp32" + else: + if self.scale_dtype not in runtime_supported_scale_dtype: + raise ValueError( + "scale_dtype must be in {}.".format(runtime_supported_scale_dtype) + ) - serializable_config_dict = {} + if self.group_size not in runtime_supported_group_size: + raise ValueError( + "group_size must be an integer in {}.".format( + runtime_supported_group_size + ) + ) - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if value != default_config_dict[key]: - serializable_config_dict[key] = value + if self.weight_dtype[:3] in ["fp8", "fp4", "nf4"]: + if self.compute_dtype in ["int8"]: + print( + "WARNING: int8 compute dtype is not be supported in float quant types! " + "Fall back to fp32." + ) + self.compute_dtype = "fp32" + if self.scheme in ["asym"]: + print( + "WARNING: asym alg is not be supported in float quant types! " + "Fall back to sym." + ) + self.scheme = "sym" + if self.scale_dtype in ["fp8"] and self.weight_dtype[:3] not in ["fp8"]: + print( + "WARNING: fp8 scale is only be supported in fp8 weight type. " + "Fall back to fp32." + ) + self.scale_dtype = "fp32" + if self.weight_dtype[:3] == "fp8" and self.scale_dtype not in [ + "fp8", + "fp32", + ]: + print( + "WARNING: fp8 weight type only supports fp8 / fp32 scale now." + " Fall back to fp8." + ) + self.scale_dtype = "fp8" - return serializable_config_dict + self.use_neural_speed = True - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + def to_json_file( + self, json_file_path: Union[str, os.PathLike], use_diff: bool = True + ): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + # set tokenizer to None due to it doesn't support write to json + if hasattr(self, "tokenizer"): + self.tokenizer = None + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + **kwargs, + ): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~PretrainedConfig.from_pretrained`] class method. @@ -619,10 +573,12 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: kwargs (`Dict[str, Any]`, *optional*): Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
""" - self._set_token_in_kwargs(kwargs) + # self._set_token_in_kwargs(kwargs) if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + raise AssertionError( + f"Provided path ({save_directory}) should be a directory, not a file" + ) os.makedirs(save_directory, exist_ok=True) @@ -633,7 +589,7 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: files_timestamps = self._get_files_timestamps(save_directory) # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, SPARSITY_CONFIG) + output_config_file = os.path.join(save_directory, QUANT_CONFIG) self.to_json_file(output_config_file, use_diff=False) logger.info(f"Configuration saved in {output_config_file}") @@ -651,4 +607,514 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: def get_config_dict( cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - return super().get_config_dict(pretrained_model_name_or_path, _configuration_file=SPARSITY_CONFIG, **kwargs) + cf = kwargs.pop("_configuration_file", QUANT_CONFIG) + return super().get_config_dict( + pretrained_model_name_or_path, _configuration_file=cf, **kwargs + ) + + +class RtnConfig(ITREXQuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using `auto-awq` library awq quantization relying on auto_awq backend. + + Args: + bits (`int`, *optional*, defaults to 4): + The number of bits to quantize to. + group_size (`int`, *optional*, defaults to 128): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + zero_point (`bool`, *optional*, defaults to `True`): + Whether to use zero point quantization. + """ + + def __init__( + self, + bits: int = 4, + group_size: int = 32, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, + mse_range: bool = False, + use_double_quant=False, + double_quant_scale_dtype=None, # reserve for double quant + scheme: str = "sym", + use_ggml: bool = False, + use_quant: bool = True, + use_neural_speed: bool = False, + llm_int8_skip_modules=None, + **kwargs, + ): + self.quant_method = QuantizationMethod.RTN + self.bits = bits + self.mse_range = mse_range + self.compute_dtype = compute_dtype + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.group_size = group_size + self.scheme = scheme + self.use_double_quant = use_double_quant + self.double_quant_scale_dtype = double_quant_scale_dtype + self.llm_int8_skip_modules = ( + llm_int8_skip_modules if llm_int8_skip_modules else [] + ) + self.use_ggml = use_ggml + self.use_quant = use_quant + self.use_neural_speed = use_neural_speed + self.device = kwargs.get("device", "auto") + self.calib_dataloader = None + self.calib_dataset = None + self.calib_func = None + self.calib_iters = None + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = RtnConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + +class GPTQConfig(ITREXQuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using `intel_extension_for_transformers` api for gptq quantization relying on CPU device. + + Args: + bits (`int`): + The number of bits to quantize to, supported numbers are (2, 3, 4, 8). + tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + The tokenizer used to process the dataset. You can pass either: + - A custom tokenizer object. + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + dataset (`Union[List[str]]`, *optional*): + The dataset used for quantization. You can provide your own dataset in a list of string or just use the + original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'] + group_size (`int`, *optional*, defaults to 128): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + damp_percent (`float`, *optional*, defaults to 0.1): + The percent of the average Hessian diagonal to use for dampening. Recommended value is 0.1. + desc_act (`bool`, *optional*, defaults to `False`): + Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly + speed up inference but the perplexity may become slightly worse. Also known as act-order. + sym (`bool`, *optional*, defaults to `True`): + Whether to use symmetric quantization. + max_input_length (`int`, *optional*): + The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input + length. It is specific to the exllama backend with act-order. 
+ + """ + + def __init__( + self, + bits: int = 4, + tokenizer: Any = None, + dataset: Optional[Union[List[str], str]] = None, + group_size: int = 32, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, + use_double_quant=False, + double_quant_scale_dtype=None, # reserve for double quant + sym: bool = True, + blocksize: int = 128, + damp_percent: float = 0.1, + desc_act: bool = False, + nsamples: int = 128, + max_input_length: Optional[int] = None, + static_groups: bool = False, + use_ggml: bool = False, + use_quant: bool = True, + use_neural_speed: bool = False, + llm_int8_skip_modules=None, + **kwargs, + ): + + from intel_extension_for_transformers.llm.quantization.utils import ( + convert_dtype_torch2str, + ) + + self.quant_method = QuantizationMethod.GPTQ + self.bits = bits + self.tokenizer = tokenizer + self.dataset = dataset + self.compute_dtype = compute_dtype + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.sym = sym + self.use_double_quant = use_double_quant + self.double_quant_scale_dtype = double_quant_scale_dtype + self.blocksize = blocksize + self.nsamples = nsamples + self.group_size = group_size + self.damp_percent = damp_percent + self.desc_act = desc_act + self.static_groups = static_groups + self.max_input_length = max_input_length + self.llm_int8_skip_modules = ( + llm_int8_skip_modules if llm_int8_skip_modules else [] + ) + self.use_ggml = use_ggml + self.use_quant = use_quant + self.use_neural_speed = use_neural_speed + self.device = kwargs.get("device", "auto") + self.calib_dataloader = kwargs.get("calib_dataloader", None) + self.calib_dataset = kwargs.get("calib_dataset", "NeelNanda/pile-10k") + self.calib_func = kwargs.get("calib_func", None) + self.calib_iters = kwargs.get("calib_iters", 100) + self.scheme = "sym" if self.sym else "asym" + + if isinstance(compute_dtype, torch.dtype): + self.compute_dtype = convert_dtype_torch2str(compute_dtype) + else: + self.compute_dtype = compute_dtype + + if isinstance(scale_dtype, torch.dtype): + self.scale_dtype = convert_dtype_torch2str(scale_dtype) + else: + self.scale_dtype = scale_dtype + + if isinstance(double_quant_scale_dtype, torch.dtype): + self.double_quant_scale_dtype = convert_dtype_torch2str( + double_quant_scale_dtype + ) + else: + self.double_quant_scale_dtype = double_quant_scale_dtype + self.post_init_gptq() + + def post_init_gptq(self): + r""" + Safety checker that arguments are correct + """ + + if self.bits not in [4, 8]: + raise ValueError( + f"Only support quantization to [4, 8] bits but found {self.bits}" + ) + + if not (0 < self.damp_percent < 1): + raise ValueError("damp_percent must between 0 and 1.") + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = GPTQConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + +class AwqConfig(ITREXQuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using `auto-awq` library awq quantization relying on auto_awq backend. + + Args: + bits (`int`, *optional*, defaults to 4): + The number of bits to quantize to. + group_size (`int`, *optional*, defaults to 128): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + zero_point (`bool`, *optional*, defaults to `True`): + Whether to use zero point quantization. + """ + + def __init__( + self, + bits: int = 8, + tokenizer: Any = None, + dataset: Optional[Union[List[str], str]] = None, + group_size: int = 32, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, + use_double_quant=False, + double_quant_scale_dtype=None, # reserve for double quant + zero_point: bool = True, + mse_range: bool = False, + use_ggml: bool = False, + use_quant: bool = True, + use_neural_speed: bool = False, + llm_int8_skip_modules=None, + **kwargs, + ): + self.quant_method = QuantizationMethod.AWQ + self.bits = bits + self.tokenizer = tokenizer + self.dataset = dataset + self.compute_dtype = compute_dtype + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.group_size = group_size + self.zero_point = zero_point + self.mse_range = mse_range + self.use_double_quant = use_double_quant + self.double_quant_scale_dtype = double_quant_scale_dtype + self.llm_int8_skip_modules = ( + llm_int8_skip_modules if llm_int8_skip_modules else [] + ) + self.use_ggml = use_ggml + self.use_quant = use_quant + self.use_neural_speed = use_neural_speed + self.device = kwargs.get("device", "auto") + self.calib_dataloader = kwargs.get("calib_dataloader", None) + self.calib_dataset = kwargs.get("calib_dataset", "NeelNanda/pile-10k") + self.calib_func = kwargs.get("calib_func", None) + self.calib_iters = kwargs.get("calib_iters", 100) + self.scheme = "asym" if self.zero_point else "sym" + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = AwqConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + +class TeqConfig(ITREXQuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using `auto-awq` library awq quantization relying on auto_awq backend. + + Args: + bits (`int`, *optional*, defaults to 4): + The number of bits to quantize to. 
+ group_size (`int`, *optional*, defaults to 128): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + zero_point (`bool`, *optional*, defaults to `True`): + Whether to use zero point quantization. + """ + + def __init__( + self, + bits: int = 8, + tokenizer: Any = None, + dataset: Optional[Union[List[str], str]] = None, + group_size: int = 32, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, + use_double_quant=False, + double_quant_scale_dtype=None, # reserve for double quant + scheme: str = "sym", + use_ggml: bool = False, + use_neural_speed: bool = False, + llm_int8_skip_modules=None, + **kwargs, + ): + self.quant_method = QuantizationMethod.TEQ + self.bits = bits + self.tokenizer = tokenizer + self.dataset = dataset + self.compute_dtype = compute_dtype + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.group_size = group_size + self.scheme = scheme + self.use_double_quant = use_double_quant + self.double_quant_scale_dtype = double_quant_scale_dtype + self.llm_int8_skip_modules = ( + llm_int8_skip_modules if llm_int8_skip_modules else [] + ) + self.use_ggml = use_ggml + self.use_neural_speed = use_neural_speed + self.device = kwargs.get("device", "auto") + self.calib_dataloader = kwargs.get("calib_dataloader", None) + self.calib_dataset = kwargs.get("calib_dataset", "NeelNanda/pile-10k") + self.calib_func = kwargs.get("calib_func", None) + self.calib_iters = kwargs.get("calib_iters", 100) + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = TeqConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + +class AutoRoundConfig(ITREXQuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using `intel_extension_for_transformers` api for gptq quantization relying on CPU device. + + Args: + bits (`int`): + The number of bits to quantize to, supported numbers are (2, 3, 4, 8). + tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + The tokenizer used to process the dataset. You can pass either: + - A custom tokenizer object. + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + dataset (`Union[List[str]]`, *optional*): + The dataset used for quantization. You can provide your own dataset in a list of string or just use the + original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'] + group_size (`int`, *optional*, defaults to 128): + The group size to use for quantization. 
Recommended value is 128 and -1 uses per-column quantization. + damp_percent (`float`, *optional*, defaults to 0.1): + The percent of the average Hessian diagonal to use for dampening. Recommended value is 0.1. + desc_act (`bool`, *optional*, defaults to `False`): + Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly + speed up inference but the perplexity may become slightly worse. Also known as act-order. + sym (`bool`, *optional*, defaults to `True`): + Whether to use symmetric quantization. + max_input_length (`int`, *optional*): + The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input + length. It is specific to the exllama backend with act-order. + + """ + + def __init__( + self, + bits: int = 8, + dtype: str = "int", + tokenizer: Any = None, + dataset: Optional[Union[List[str], str]] = None, + group_size: int = 32, + compute_dtype: Any = None, + weight_dtype: Any = None, + scale_dtype: Any = None, + use_double_quant=False, + double_quant_scale_dtype=None, # reserve for double quant + sym: bool = True, + lr: float = 0.0025, + minmax_lr: float = 0.0025, + use_quant_input: bool = True, + nsamples: int = 128, + iters: int = 200, + static_groups: bool = False, + use_ggml: bool = False, + use_neural_speed: bool = False, + llm_int8_skip_modules=None, + **kwargs, + ): + + from intel_extension_for_transformers.llm.quantization.utils import ( + convert_dtype_torch2str, + ) + + self.quant_method = QuantizationMethod.AUTOROUND + self.bits = bits + self.tokenizer = tokenizer + self.dataset = dataset + self.compute_dtype = compute_dtype + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.sym = sym + self.use_double_quant = use_double_quant + self.double_quant_scale_dtype = double_quant_scale_dtype + self.nsamples = nsamples + self.group_size = group_size + self.lr = lr + self.minmax_lr = minmax_lr + self.use_quant_input = use_quant_input + self.iters = iters + self.llm_int8_skip_modules = ( + llm_int8_skip_modules if llm_int8_skip_modules else [] + ) + self.use_ggml = use_ggml + self.use_neural_speed = use_neural_speed + self.device = kwargs.get("device", "auto") + self.calib_dataloader = kwargs.get("calib_dataloader", None) + self.calib_dataset = kwargs.get("calib_dataset", "NeelNanda/pile-10k") + self.calib_len = kwargs.get("calib_len", None) + self.calib_func = kwargs.get("calib_func", None) + self.calib_iters = kwargs.get("calib_iters", 100) + self.scheme = "sym" if self.sym else "asym" + if isinstance(compute_dtype, torch.dtype): + self.compute_dtype = convert_dtype_torch2str(compute_dtype) + else: + self.compute_dtype = compute_dtype + + if isinstance(scale_dtype, torch.dtype): + self.scale_dtype = convert_dtype_torch2str(scale_dtype) + else: + self.scale_dtype = scale_dtype + + if isinstance(double_quant_scale_dtype, torch.dtype): + self.double_quant_scale_dtype = convert_dtype_torch2str( + double_quant_scale_dtype + ) + else: + self.double_quant_scale_dtype = double_quant_scale_dtype + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = AutoRoundConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index ef27afc2c06..2551abf41f2 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -32,7 +32,7 @@ DECODER_WITH_PAST_NAME = "decoder_with_past_model.bin" WEIGHTS_NAME = "pytorch_model.bin" WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json" -QUANT_CONFIG = "quantization_config.json" +QUANT_CONFIG = "quantize_config.json" SPARSITY_CONFIG = "sparsity_config.json" SAFE_WEIGHTS_NAME = "model.safetensors" SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json" diff --git a/tests/CI/test_quantization.py b/tests/CI/test_quantization.py index cc416ab94d3..315f4499397 100644 --- a/tests/CI/test_quantization.py +++ b/tests/CI/test_quantization.py @@ -316,7 +316,11 @@ def test_quantization_for_llm(self): from intel_extension_for_transformers.transformers import ( MixedPrecisionConfig, SmoothQuantConfig, - WeightOnlyQuantConfig, + RtnConfig, + AwqConfig, + TeqConfig, + GPTQConfig, + AutoRoundConfig, BitsAndBytesConfig ) from intel_extension_for_transformers.transformers import AutoModelForCausalLM @@ -354,7 +358,7 @@ def test_quantization_for_llm(self): # weight-only # RTN - woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange") + woq_config = RtnConfig(bits=4, weight_dtype="int4_fullrange") woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, use_neural_speed=False @@ -364,24 +368,26 @@ def test_quantization_for_llm(self): self.assertTrue(isclose(float(output[0][0][0][0]), 0.16387596726417542, rel_tol=1e-04)) # AWQ - woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", - calib_iters=5, - tokenizer=tokenizer, - algorithm="AWQ") + woq_config = AwqConfig(bits=4, + weight_dtype="int4_fullrange", + zero_point=False, + calib_iters=5, + tokenizer=tokenizer + ) + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, use_neural_speed=False ) woq_model.eval() output = woq_model(dummy_input) - print("output:", float(output[0][0][0][0])) - self.assertTrue(isclose(float(output[0][0][0][0]), 0.17239853739738464, rel_tol=1e-04)) + self.assertTrue(isclose(float(output[0][0][0][0]), 0.1714431792497635, rel_tol=1e-04)) # TEQ - woq_config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", + woq_config = TeqConfig(bits=4, weight_dtype="int4_fullrange", calib_iters=5, tokenizer=tokenizer, - algorithm="TEQ") + ) woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, use_neural_speed=False @@ -390,7 +396,7 @@ def test_quantization_for_llm(self): output = woq_model(dummy_input) # fp8 - woq_config = WeightOnlyQuantConfig(weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") + woq_config = RtnConfig(bits=8, weight_dtype="fp8_e5m2", scale_dtype="fp8_e8m0") woq_model = AutoModelForCausalLM.from_pretrained( model_name_or_path, quantization_config=woq_config, 
use_neural_speed=False ) @@ -410,12 +416,6 @@ def test_quantization_for_llm(self): output = amp_model(dummy_input) self.assertTrue(isclose(float(output[0][0][0][0]), 0.1689453125, rel_tol=1e-04)) - # bitsandbytes, for cpu is fp32 model - bab_config = BitsAndBytesConfig() - bab_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=bab_config, - use_neural_speed=False - ) # load_in_4bit bit4_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_4bit=True, @@ -423,7 +423,6 @@ def test_quantization_for_llm(self): ) bit4_model.eval() output = bit4_model(dummy_input) - print("output:", float(output[0][0][0][0])) self.assertTrue(isclose(float(output[0][0][0][0]), 0.18726778030395508, rel_tol=1e-04)) # load_in_8bit @@ -434,22 +433,19 @@ def test_quantization_for_llm(self): ) bit8_model.eval() output = bit8_model(dummy_input) - print("output:", float(output[0][0][0][0])) self.assertTrue(isclose(float(output[0][0][0][0]), 0.1675747185945511, rel_tol=1e-04)) # GPTQ - algorithm_args = { - "act_order": False, - "percdamp": 0.01, - "block_size": 32 , - "nsamples": 3, - "use_max_length": True, - "pad_max_length": 256, - } - woq_config = WeightOnlyQuantConfig(weight_dtype="int4_clip", - algorithm_args=algorithm_args, - tokenizer=tokenizer, - algorithm="GPTQ") + woq_config = GPTQConfig(bits=4, + weight_dtype="int4_clip", + sym=True, + desc_act=False, + damp_percent=0.01, + blocksize=32, + nsamples=3, + max_input_length=256, + tokenizer=tokenizer, + ) woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, use_neural_speed=False @@ -459,16 +455,13 @@ def test_quantization_for_llm(self): self.assertTrue(isclose(float(output[0][0][0][0]), 0.17126554250717163, rel_tol=1e-04)) # AUTOROUND - algorithm_args = { - "n_samples": 128, - "seq_len": 32, - "iters": 5, - "scale_dtype": "fp32", - } - woq_config = WeightOnlyQuantConfig(weight_dtype="int4_clip", - algorithm_args=algorithm_args, - tokenizer=tokenizer, - algorithm="AUTOROUND") + woq_config = AutoRoundConfig(bits=4, + weight_dtype="int4_clip", + nsamples=128, + calib_len=32, + calib_iters=5, + tokenizer=tokenizer + ) woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config, use_neural_speed=False diff --git a/tests/CI/test_weight_only.py b/tests/CI/test_weight_only.py index 63ca919f250..9f3f6f9970f 100644 --- a/tests/CI/test_weight_only.py +++ b/tests/CI/test_weight_only.py @@ -32,7 +32,7 @@ from intel_extension_for_transformers.llm.quantization.nn.modules import QuantizedLinearQBits, QuantizedLoraLinearQBits from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, replace_linear from intel_extension_for_transformers.llm.utils.generation import _beam_search, _greedy_search -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import RtnConfig class DummyDataset(data.Dataset): @@ -77,7 +77,7 @@ def tearDownClass(cls) -> None: shutil.rmtree('tmp', ignore_errors=True) def test_woq_config(self): - config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", group_size=32) + config = RtnConfig(bits=4, weight_dtype="int4_fullrange", group_size=32) diff_res = config.to_diff_dict() ref_config = {'weight_dtype': 'int4_fullrange'} self.assertEqual(diff_res, ref_config) @@ -88,7 +88,7 @@ def test_woq_config(self): print(config) def test_woq_config_post_init_runtime(self): - config = 
WeightOnlyQuantConfig(weight_dtype="fp4", compute_dtype="int8", scheme="asym", scale_dtype="fp8") + config = RtnConfig(bits=4, weight_dtype="fp4", compute_dtype="int8", scheme="asym", scale_dtype="fp8") config.post_init_runtime() config_dict = config.to_dict() self.assertEqual(config_dict["weight_dtype"], "fp4_e2m1") @@ -110,8 +110,8 @@ def test_int8(self): activation = torch.rand(1, 32, dtype=torch.float) output = model(activation) - config = WeightOnlyQuantConfig(weight_dtype="int8", group_size=32) - config.post_init() + config = RtnConfig(bits=8, weight_dtype="int8", group_size=32) + config.post_init_cpu() convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -132,8 +132,8 @@ def test_int4(self): with torch.no_grad(): model.linear.weight = torch.nn.Parameter(raw_wei) - config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", group_size=32) - config.post_init() + config = RtnConfig(weight_dtype="int4_fullrange", group_size=32) + config.post_init_cpu() convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -165,7 +165,7 @@ def test_auto_model(self): self.assertTrue(len(output) == 2 and isinstance(output[1], list)) def test_auto_model_with_config(self): - config = WeightOnlyQuantConfig() + config = RtnConfig() model = AutoModelForCausalLM.from_pretrained(llama_model_path, quantization_config=config, use_neural_speed=False) diff --git a/tests/CI/test_weight_only_gpu.py b/tests/CI/test_weight_only_gpu.py index 12b29370ce1..af96f5edc31 100644 --- a/tests/CI/test_weight_only_gpu.py +++ b/tests/CI/test_weight_only_gpu.py @@ -21,7 +21,7 @@ import torch.utils.data as data from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM -from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import GPTQConfig, RtnConfig from math import isclose from transformers import AutoTokenizer from intel_extension_for_transformers.utils.utils import get_gpu_family, _ipex_available @@ -104,16 +104,12 @@ def test_int4_ipex_arc_with_auto(self): fp16_out = output.to("cpu") print("fp16 logits {}".format(fp16_out.shape)) - config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", - group_size=32, - compute_dtype="fp16", - scale_dtype="fp16") - config.calib_dataloader = DataLoader( - DummyDataset(MODEL_NAME, model.seqlen), - batch_size=1, - shuffle=False, - ) - qmodel = AutoModelForCausalLM.from_pretrained(model_name, use_neural_speed=False, + config = RtnConfig( + weight_dtype="int4_fullrange", + group_size=32, + compute_dtype="fp16", + scale_dtype="fp16") + qmodel = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_neural_speed=False, device_map=device_map, quantization_config=config, trust_remote_code=True, torch_dtype=torch.float16) qmodel.save_pretrained(self.workspace) @@ -141,18 +137,13 @@ def test_int4_gptq(self): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) prompt = "how to test the code?" 
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device=device_map) - algorithm_args = { - "act_order": False, - "percdamp": 0.01, - "block_size": 32 , - "nsamples": 3, - "use_max_length": True, - "pad_max_length": 256, - } - woq_config = WeightOnlyQuantConfig( - algorithm_args=algorithm_args, - tokenizer=tokenizer, - algorithm="GPTQ") + woq_config = GPTQConfig( + desc_act=False, + damp_percent=0.01, + block_size=32, + nsamples=3, + max_input_length=256, + tokenizer=tokenizer) woq_model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, quantization_config=woq_config, @@ -163,7 +154,7 @@ def test_int4_gptq(self): ) woq_model.config.architectures = ["GPTJForCausalLM"] woq_model = ipex.optimize_transformers( - woq_model, device=device_map, inplace=True, woq=True, dtype=torch.float16) + woq_model, device=device_map, inplace=True, quantization_config=woq_config, dtype=torch.float16) with torch.inference_mode(), torch.no_grad(), torch.autocast( device_type=device_map, enabled=True, diff --git a/tests/Nightly/test_llm_runtime.py b/tests/Nightly/test_llm_runtime.py index 96f86c7df8b..82bc2860fc1 100644 --- a/tests/Nightly/test_llm_runtime.py +++ b/tests/Nightly/test_llm_runtime.py @@ -21,7 +21,7 @@ import unittest from transformers import AutoTokenizer -from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig +from intel_extension_for_transformers.transformers import AutoModel, RtnConfig def cmpData(numa, numb): totalErr = ((np.abs(numa - numb))**2).sum() @@ -53,7 +53,7 @@ def test_llm_runtime(self): print(tokenizer.decode(pt_generate_ids)) # check output ids - woq_config = WeightOnlyQuantConfig(use_quant=False) + woq_config = RtnConfig(use_quant=False) itrex_model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_neural_speed=True, trust_remote_code=True) itrex_generate_ids = itrex_model.generate(inputs.input_ids, do_sample=False, max_new_tokens=100)[0] print(tokenizer.decode(itrex_generate_ids))
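Taken together, the test updates above show the migration pattern: hyperparameters that were previously packed into `WeightOnlyQuantConfig(algorithm=..., algorithm_args=...)` become constructor arguments of the per-algorithm classes. A minimal usage sketch of the new GPTQ path, mirroring `test_quantization.py` above; the model path is a placeholder.

```python
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, GPTQConfig

model_name_or_path = "PATH_TO_MODEL"  # placeholder for a local or Hub model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# GPTQ hyperparameters are now first-class constructor arguments
# (previously nested inside algorithm_args).
woq_config = GPTQConfig(
    bits=4,
    weight_dtype="int4_clip",
    desc_act=False,        # was "act_order" in algorithm_args
    damp_percent=0.01,     # was "percdamp"
    blocksize=32,          # was "block_size"
    nsamples=3,
    max_input_length=256,  # stands in for the old "use_max_length"/"pad_max_length"
    tokenizer=tokenizer,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=woq_config,
    use_neural_speed=False,
)
```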