From 56f8064fc4c619261aebf1bfde4f182d09a7fa64 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 31 Aug 2023 16:21:04 +0800 Subject: [PATCH 1/4] refine example for SQ and WOQ Signed-off-by: Xin He --- examples/.config/pytorch_optimize.json | 81 ++++++++++- .../language-modeling/quantization/README.md | 15 ++- .../quantization/run_benchmark.sh | 12 +- .../quantization/run_clm_no_trainer.py | 127 ++++++++---------- .../quantization/run_tuning.sh | 25 +++- 5 files changed, 177 insertions(+), 83 deletions(-) diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index f7e34838053..4ee6ce1ef1d 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -1592,6 +1592,30 @@ } } }, + "chatglm_clm_weight_only": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "chatglm_weight_only", + "task": "clm", + "approach": "weight_only", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "chatglm_weight_only", + "task": "clm", + "mode": "accuracy", + "batch_size": "112", + "config": "saved_results", + "iters": "100", + "int8": "false" + } + } + }, "opt_125m_clm_weight_only": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { @@ -1640,12 +1664,12 @@ } } }, - "chatglm_clm_weight_only": { + "opt_125m_clm_weight_only_gptq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "opt_125m_weight_only_gptq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1654,7 +1678,31 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "opt_125m_weight_only_gptq", + "task": "clm", + "mode": "accuracy", + "batch_size": "112", + "config": "saved_results", + "iters": "100", + "int8": "false" + } + } + }, + "opt_125m_clm_weight_only_teq": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "opt_125m_weight_only_teq", + "task": "clm", + "approach": "weight_only", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "opt_125m_weight_only_teq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1664,6 +1712,33 @@ } } }, + "opt_125m_clm_ipex": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "opt_125m", + "task": "clm", + "approach": "static", + "backend": "ipex", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "opt_125m", + "task": "clm", + "approach": "static", + "backend": "ipex", + "mode": "accuracy", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, "opt_1.3b_clm_ipex": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { diff --git a/examples/huggingface/pytorch/language-modeling/quantization/README.md b/examples/huggingface/pytorch/language-modeling/quantization/README.md index a016b29077a..52629e0afc5 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/README.md +++ 
b/examples/huggingface/pytorch/language-modeling/quantization/README.md @@ -48,9 +48,16 @@ python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --quantize \ --approach weight_only \ - --output_dir "saved_results" \ + --weight_only_bits 4 \ + --weight_only_group 128 \ + --weight_only_scheme asym \ + --weight_only_algo RTN \ + --weight_only_mse_range \ + --output_dir "saved_results" ``` -**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger AWQ algorithm. `--gptq` will trigger GPTQ algorithm. For example, to run a GPTQ example, try the following command. +**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to the [weight-only quantization documentation](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). + + ```bash python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ @@ -242,5 +249,5 @@ python run_mlm.py \ --overwrite_output_dir ``` -[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023). -[2]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023). +[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022). +[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023). diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh index e2702f0f0cd..9f9116ab3f9 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh @@ -107,16 +107,20 @@ function run_benchmark { elif [ "${topology}" = "falcon_7b_instruct" ]; then script="run_clm_no_trainer.py" model_name_or_path="tiiuae/falcon-7b-instruct" - elif [ "${topology}" = "opt_125m_weight_only" ]; then + elif [ "${topology}" = "opt_125m_weight_only" -o \ "${topology}" = "opt_125m_weight_only_awq" -o \ "${topology}" = "opt_125m_weight_only_gptq" -o \ "${topology}" = "opt_125m_weight_only_teq" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then + elif [ "${topology}" = "opt_125m" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" - lm_eval_tasks="lambada_openai" - extra_cmd=$extra_cmd" --approach weight_only" + if [ "${backend}" = "ipex" ]; then + extra_cmd=$extra_cmd" --ipex" + fi elif [ "${topology}" = "opt_1.3b" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-1.3b" diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index bd0a96a1292..7e8cf77abfd 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -31,19 +31,6 @@ )
parser.add_argument("--approach", type=str, default='static', help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--sq", action="store_true") -parser.add_argument("--alpha", default="auto", - help="Smooth quant parameter.") -# ============gptq configs=============== -parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") -parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') -parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.') -parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.') -parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length') -parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length') -# ======================================= -parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], - help="Weight-only parameter.") parser.add_argument("--int8", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") parser.add_argument("--accuracy", action="store_true") @@ -58,11 +45,28 @@ parser.add_argument("--tasks", nargs='+', default=["lambada_openai", "hellaswag","winogrande","piqa","wikitext"], type=str, help="tasks list for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") +# ============WeightOnly configs=============== +parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], + help="Weight-only parameter.") parser.add_argument("--weight_only_bits", type=int, default=8) parser.add_argument("--weight_only_group", type=int, default=-1) parser.add_argument("--weight_only_scheme", default="sym") +parser.add_argument("--weight_only_mse_range", action="store_true") parser.add_argument("--weight_only_sym_full_range", action="store_true") -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# =============GPTQ configs==================== +parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") +parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') +parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. 
sub weight matrix size to run GPTQ.') +parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.') +parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length') +parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \ + this should align with your model config, \ + and your dataset builder args: args.pad_max_length') +# ======================================= args = parser.parse_args() if args.ipex: @@ -231,7 +235,9 @@ def calib_func(prepared_model): prepared_model(calib_input[0]) recipes = {} + eval_func = None from neural_compressor import PostTrainingQuantConfig, quantization + # specify the op_type_dict and op_name_dict if args.approach == 'weight_only': op_type_dict = { '.*':{ # re.match @@ -243,55 +249,41 @@ def calib_func(prepared_model): }, }, } - if args.weight_only_sym_full_range: - recipes.update({"rtn_args": {"sym_full_range": True}}) - else: - if re.search("gpt", user_model.config.model_type): - op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, - } - else: - op_type_dict = {} - excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if args.sq: - # alpha can be a float number of a list of float number. - args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha) - if re.search("falcon", user_model.config.model_type): - recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}} - else: - recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}} - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - approach=args.approach, - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - recipes=recipes, - ) - elif args.weight_only_algo == "GPTQ": - recipes = { - 'gptq_args': { - 'percdamp': args.gptq_percdamp, - 'act_order':args.gptq_actorder, - 'block_size': args.gptq_block_size, - 'nsamples': args.gptq_nsamples, - 'use_max_length': args.gptq_use_max_length - } + op_name_dict={ + 'lm_head':{"weight": {'dtype': 'fp32'},}, + 'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2 } + recipes["rtn_args"] = { + "mse_range": args.weight_only_mse_range, + "sym_full_range": args.weight_only_sym_full_range, + } + # GPTQ: use assistive functions to modify calib_dataloader and calib_func + # TEQ: set calib_func=None, use default training func as calib_func + if args.weight_only_algo in ["GPTQ", "TEQ"]: + calib_func = None + conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", approach=args.approach, - excluded_precisions=excluded_precisions, op_type_dict=op_type_dict, - op_name_dict={ - '.*lm_head':{ # re.match - "weight": { - 'dtype': 'fp32' - }, - }, - }, + op_name_dict=op_name_dict, recipes=recipes, ) else: + if re.search("gpt", user_model.config.model_type): + op_type_dict = { + "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, + } + else: + op_type_dict = {} + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if args.sq: + # alpha can be a float number of a list of float number. 
+ args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha) + if re.search("falcon", user_model.config.model_type): + recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}} + else: + recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}} + conf = PostTrainingQuantConfig( backend="ipex" if args.ipex else "default", approach=args.approach, @@ -300,20 +292,13 @@ def calib_func(prepared_model): recipes=recipes, ) - # when GPTQ is enabled: use assistive functions to modify calib_dataloader and calib_func - if args.weight_only_algo == "GPTQ": - calib_func = None - if args.weight_only_algo == 'TEQ': - # set calib_func=None, use default training func as calib_func - calib_func = None - - eval_dataset = load_dataset('lambada', split='validation') - evaluator = Evaluator(eval_dataset, tokenizer) - def eval_func(model): - acc = evaluator.evaluate(model) - return acc - # eval_func should be set when tuning alpha. - eval_func = eval_func if isinstance(args.alpha, list) else None + # eval_func should be set when tuning alpha. + if isinstance(args.alpha, list): + eval_dataset = load_dataset('lambada', split='validation') + evaluator = Evaluator(eval_dataset, tokenizer) + def eval_func(model): + acc = evaluator.evaluate(model) + return acc q_model = quantization.fit( user_model, @@ -339,7 +324,7 @@ def eval_func(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.evaluation.lm_eval import evaluate + from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate results = evaluate( model="hf-causal", model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32', diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh index e363363cafe..003996a6913 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh @@ -145,13 +145,36 @@ function run_tuning { DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range" elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" + elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="weight_only" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ" + elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="weight_only" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ" + elif [ "${topology}" = "opt_125m" ]; then + if [ "${backend}" = "ipex" ]; then + extra_cmd=$extra_cmd" --ipex" + fi + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="PostTrainingStatic" + extra_cmd=$extra_cmd" --int8_bf16_mixed" + alpha=0.8 + extra_cmd=$extra_cmd" --sq --alpha "${alpha} elif [ 
"${topology}" = "opt_1.3b" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" From 272b8da360bdc16c32c1f1cee86c3b9208d8a954 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 1 Sep 2023 09:11:48 +0800 Subject: [PATCH 2/4] add gptq args Signed-off-by: Xin He --- .../language-modeling/quantization/run_clm_no_trainer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index 7e8cf77abfd..43520a189ed 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -257,6 +257,13 @@ def calib_func(prepared_model): "mse_range": args.weight_only_mse_range, "sym_full_range": args.weight_only_sym_full_range, } + recipes['gptq_args'] = { + 'percdamp': args.gptq_percdamp, + 'act_order':args.gptq_actorder, + 'block_size': args.gptq_block_size, + 'nsamples': args.gptq_nsamples, + 'use_max_length': args.gptq_use_max_length + } # GPTQ: use assistive functions to modify calib_dataloader and calib_func # TEQ: set calib_func=None, use default training func as calib_func if args.weight_only_algo in ["GPTQ", "TEQ"]: From e5b3bffdbb3ff6b2864bc7e3dfd96a007508b831 Mon Sep 17 00:00:00 2001 From: Xin He Date: Sun, 3 Sep 2023 16:58:18 +0800 Subject: [PATCH 3/4] use woq as args name Signed-off-by: Xin He --- examples/.config/pytorch_optimize.json | 42 +++++++++---------- .../quantization/run_benchmark.sh | 14 +++---- .../quantization/run_clm_no_trainer.py | 32 +++++++------- .../quantization/run_tuning.sh | 26 ++++++------ 4 files changed, 57 insertions(+), 57 deletions(-) diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index 4ee6ce1ef1d..8d1b8c627c1 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -1465,12 +1465,12 @@ } } }, - "gpt_j_6b_clm_weight_only": { + "gpt_j_6b_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "gpt_j_weight_only", + "topology": "gpt_j_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1479,7 +1479,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "gpt_j_weight_only", + "topology": "gpt_j_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1489,12 +1489,12 @@ } } }, - "gpt_j_6b_clm_weight_only_awq": { + "gpt_j_6b_clm_woq_awq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "gpt_j_weight_only_awq", + "topology": "gpt_j_woq_awq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1503,7 +1503,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "gpt_j_weight_only_awq", + "topology": "gpt_j_woq_awq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1592,12 +1592,12 @@ } } }, - "chatglm_clm_weight_only": { + "chatglm_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "chatglm_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1606,7 +1606,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": 
"chatglm_weight_only", + "topology": "chatglm_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1616,12 +1616,12 @@ } } }, - "opt_125m_clm_weight_only": { + "opt_125m_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only", + "topology": "opt_125m_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1630,7 +1630,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only", + "topology": "opt_125m_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1640,12 +1640,12 @@ } } }, - "opt_125m_clm_weight_only_awq": { + "opt_125m_clm_woq_awq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_awq", + "topology": "opt_125m_woq_awq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1654,7 +1654,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_awq", + "topology": "opt_125m_woq_awq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1664,12 +1664,12 @@ } } }, - "opt_125m_clm_weight_only_gptq": { + "opt_125m_clm_woq_gptq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_gptq", + "topology": "opt_125m_woq_gptq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1678,7 +1678,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_gptq", + "topology": "opt_125m_woq_gptq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1688,12 +1688,12 @@ } } }, - "opt_125m_clm_weight_only_teq": { + "opt_125m_clm_woq_teq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_teq", + "topology": "opt_125m_woq_teq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1702,7 +1702,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_teq", + "topology": "opt_125m_woq_teq", "task": "clm", "mode": "accuracy", "batch_size": "112", diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh index 9f9116ab3f9..24912b52f5e 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh @@ -83,17 +83,17 @@ function run_benchmark { model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" extra_cmd=$extra_cmd" --ipex" fi - elif [ "${topology}" = "gpt_j_weight_only" ]; then + elif [ "${topology}" = "gpt_j_woq" ]; then script="run_clm_no_trainer.py" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "chatglm_weight_only" ]; then + elif [ "${topology}" = "chatglm_woq" ]; then script="run_clm_no_trainer.py" model_name_or_path="THUDM/chatglm-6b" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then + elif [ "${topology}" = "gpt_j_woq_awq" ]; then 
script="run_clm_no_trainer.py" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" lm_eval_tasks="lambada_openai" @@ -107,10 +107,10 @@ function run_benchmark { elif [ "${topology}" = "falcon_7b_instruct" ]; then script="run_clm_no_trainer.py" model_name_or_path="tiiuae/falcon-7b-instruct" - elif [ "${topology}" = "opt_125m_weight_only" -o \ - "${topology}" = "opt_125m_weight_only_awq" -o \ - "${topology}" = "opt_125m_weight_only_gptq" -o \ - "${topology}" = "opt_125m_weight_only_teq" ]; then + elif [ "${topology}" = "opt_125m_woq" -o \ + "${topology}" = "opt_125m_woq_awq" -o \ + "${topology}" = "opt_125m_woq_gptq" -o \ + "${topology}" = "opt_125m_woq_teq" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" lm_eval_tasks="lambada_openai" diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index 43520a189ed..89cba83383c 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -50,13 +50,13 @@ parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") # ============WeightOnly configs=============== -parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], help="Weight-only parameter.") -parser.add_argument("--weight_only_bits", type=int, default=8) -parser.add_argument("--weight_only_group", type=int, default=-1) -parser.add_argument("--weight_only_scheme", default="sym") -parser.add_argument("--weight_only_mse_range", action="store_true") -parser.add_argument("--weight_only_sym_full_range", action="store_true") +parser.add_argument("--woq_bits", type=int, default=8) +parser.add_argument("--woq_group_size", type=int, default=-1) +parser.add_argument("--woq_scheme", default="sym") +parser.add_argument("--woq_mse_range", action="store_true") +parser.add_argument("--woq_sym_full_range", action="store_true") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') @@ -88,7 +88,7 @@ def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_ @torch.no_grad() def tokenize_function(self, examples): - if args.weight_only_algo in ['TEQ']: + if args.woq_algo in ['TEQ']: if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token example = self.tokenizer(examples["text"], padding="max_length", max_length=self.pad_max) @@ -153,7 +153,7 @@ def evaluate(self, model): def get_user_model(): from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer torchscript = False - if args.sq or args.weight_only_algo in ['AWQ', 'TEQ']: + if args.sq or args.woq_algo in ['AWQ', 'TEQ']: torchscript = True if re.search("llama", args.model.lower()): import transformers @@ -202,7 +202,7 @@ def get_user_model(): tokenizer = AutoTokenizer.from_pretrained(args.model) # Set model's seq_len when GPTQ calibration is enabled. 
- if args.weight_only_algo == 'GPTQ': + if args.woq_algo == 'GPTQ': user_model.seqlen = args.gptq_pad_max_length if args.peft_model_id is not None: @@ -242,10 +242,10 @@ def calib_func(prepared_model): op_type_dict = { '.*':{ # re.match "weight": { - 'bits': args.weight_only_bits, # 1-8 bits - 'group_size': args.weight_only_group, # -1 (per-channel) - 'scheme': args.weight_only_scheme, # sym/asym - 'algorithm': args.weight_only_algo, # RTN/AWQ/TEQ + 'bits': args.woq_bits, # 1-8 bits + 'group_size': args.woq_group_size, # -1 (per-channel) + 'scheme': args.woq_scheme, # sym/asym + 'algorithm': args.woq_algo, # RTN/AWQ/TEQ }, }, } @@ -254,8 +254,8 @@ def calib_func(prepared_model): 'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2 } recipes["rtn_args"] = { - "mse_range": args.weight_only_mse_range, - "sym_full_range": args.weight_only_sym_full_range, + "mse_range": args.woq_mse_range, + "sym_full_range": args.woq_sym_full_range, } recipes['gptq_args'] = { 'percdamp': args.gptq_percdamp, @@ -266,7 +266,7 @@ def calib_func(prepared_model): } # GPTQ: use assistive functions to modify calib_dataloader and calib_func # TEQ: set calib_func=None, use default training func as calib_func - if args.weight_only_algo in ["GPTQ", "TEQ"]: + if args.woq_algo in ["GPTQ", "TEQ"]: calib_func = None conf = PostTrainingQuantConfig( diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh index 003996a6913..250d31780be 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh @@ -106,24 +106,24 @@ function run_tuning { extra_cmd=$extra_cmd" --int8_bf16_mixed" extra_cmd=$extra_cmd" --sq --alpha "${alpha} fi - elif [ "${topology}" = "gpt_j_weight_only" ]; then + elif [ "${topology}" = "gpt_j_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" approach="weight_only" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "chatglm_weight_only" ]; then + elif [ "${topology}" = "chatglm_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="THUDM/chatglm-6b" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN" - elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo RTN" + elif [ "${topology}" = "gpt_j_woq_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" + extra_cmd=$extra_cmd" --approach weight_only --woq_algo AWQ --calib_iters 128" elif [ "${topology}" = "mpt_7b_chat" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" @@ -140,30 +140,30 @@ function run_tuning { approach="PostTrainingStatic" alpha=0.7 extra_cmd=$extra_cmd" --sq --alpha "${alpha} - elif [ "${topology}" = "opt_125m_weight_only" ]; then + elif [ "${topology}" = "opt_125m_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range" - elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then + 
extra_cmd=$extra_cmd" --approach weight_only --woq_algo RTN --woq_mse_range" + elif [ "${topology}" = "opt_125m_woq_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" - elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo AWQ --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_gptq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ" - elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo GPTQ" + elif [ "${topology}" = "opt_125m_woq_teq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ" + extra_cmd=$extra_cmd" --approach weight_only --woq_algo TEQ" elif [ "${topology}" = "opt_125m" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" From 626c4a17023be35a5cfb5bc3c5aae95b20eb9959 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Sep 2023 10:40:53 +0800 Subject: [PATCH 4/4] change readme Signed-off-by: Xin He --- .../language-modeling/quantization/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/huggingface/pytorch/language-modeling/quantization/README.md b/examples/huggingface/pytorch/language-modeling/quantization/README.md index 52629e0afc5..cd6a05cd97e 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/README.md +++ b/examples/huggingface/pytorch/language-modeling/quantization/README.md @@ -48,11 +48,11 @@ python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --quantize \ --approach weight_only \ - --weight_only_bits 4 \ - --weight_only_group 128 \ - --weight_only_scheme asym \ - --weight_only_algo RTN \ - --weight_only_mse_range \ + --woq_bits 4 \ + --woq_group_size 128 \ + --woq_scheme asym \ + --woq_algo RTN \ + --woq_mse_range \ --output_dir "saved_results" ``` **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md) @@ -61,8 +61,8 @@ python run_clm_no_trainer.py \ ```bash python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ - --weight_only_algo GPTQ \ - --weight_only_bits 4 \ + --woq_algo GPTQ \ + --woq_bits 4 \ --quantize \ --pad_max_length 2048 \ --gptq_pad_max_length 2048 \