This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

refine example for SQ and WOQ #209

Merged · 6 commits · Sep 8, 2023

Changes from 1 commit
81 changes: 78 additions & 3 deletions examples/.config/pytorch_optimize.json
@@ -1592,6 +1592,30 @@
}
}
},
"chatglm_clm_weight_only": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_weight_only": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
@@ -1640,12 +1664,12 @@
}
}
},
"chatglm_clm_weight_only": {
"opt_125m_clm_weight_only_gptq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_weight_only_gptq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
@@ -1654,7 +1678,31 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_weight_only_gptq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_weight_only_teq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only_teq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only_teq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
@@ -1664,6 +1712,33 @@
}
}
},
"opt_125m_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_1.3b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
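Each config entry above pairs a `tune` and a `benchmark` command with a flat set of params. Assuming the test harness expands each param as a `--key=value` flag for the wrapper scripts (a convention inferred from these examples; the harness itself is not shown in this diff), the new `chatglm_clm_weight_only` entry would expand to roughly:

```bash
# Hypothetical expansion of the "chatglm_clm_weight_only" entry above;
# the --key=value syntax is an assumption, not shown in this diff.
cd examples/huggingface/pytorch/language-modeling/quantization

# tune: produce the weight-only quantized model
bash run_tuning.sh --topology=chatglm_weight_only --task=clm \
    --approach=weight_only --output_model=saved_results

# benchmark: accuracy mode against the FP32 baseline (int8=false)
bash run_benchmark.sh --topology=chatglm_weight_only --task=clm \
    --mode=accuracy --batch_size=112 --config=saved_results \
    --iters=100 --int8=false
```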
examples/huggingface/pytorch/language-modeling/quantization/README.md

@@ -48,9 +48,16 @@
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--approach weight_only \
--output_dir "saved_results" \
--weight_only_bits 4 \
--weight_only_group 128 \
--weight_only_scheme asym \
--weight_only_algo RTN \
--weight_only_mse_range \
--output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`; `--awq` triggers the AWQ algorithm and `--gptq` triggers the GPTQ algorithm. For example, to run a GPTQ example, try the following command.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [the weight-only quantization documentation](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md).


```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
# … (unchanged lines collapsed in this diff view)
```

@@ -242,5 +249,5 @@
```bash
python run_mlm.py \
--overwrite_output_dir
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
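The note above lists RTN, GPTQ, AWQ, and TEQ as the supported weight-only algorithms but only shows an RTN command. A GPTQ counterpart, sketched from the flags this PR adds to `run_clm_no_trainer.py` (all flags exist in the new argparse; the particular values are illustrative, not tuned settings):

```bash
# Sketch of a weight-only GPTQ run; flags come from this PR's argparse,
# but the values below are illustrative.
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --quantize \
    --approach weight_only \
    --weight_only_algo GPTQ \
    --weight_only_bits 4 \
    --gptq_block_size 128 \
    --gptq_nsamples 128 \
    --gptq_use_max_length \
    --output_dir "saved_results"
```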
examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh

@@ -107,16 +107,20 @@ function run_benchmark {
elif [ "${topology}" = "falcon_7b_instruct" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="tiiuae/falcon-7b-instruct"
elif [ "${topology}" = "opt_125m_weight_only" ]; then
elif [ "${topology}" = "opt_125m_weight_only" -o \
"${topology}" = "opt_125m_weight_only_awq" -o \
"${topology}" = "opt_125m_weight_only_gptq" -o \
"${topology}" = "opt_125m_weight_only_teq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
elif [ "${topology}" = "opt_125m" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "opt_1.3b" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-1.3b"
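The consolidated first branch now routes all four `opt_125m_weight_only*` topologies through the same model/task setup, while the new `opt_125m` branch handles the static path and appends `--ipex` only when the backend asks for it. Under the same `--key=value` assumption as above, the new static/IPEX benchmark would be driven like:

```bash
# Assumed invocation for the new "opt_125m" branch; flag parsing is
# inferred from the params in pytorch_optimize.json, not shown here.
bash run_benchmark.sh --topology=opt_125m --task=clm --approach=static \
    --backend=ipex --mode=accuracy --batch_size=112 \
    --config=saved_results --iters=100 --int8=false
```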
examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py

@@ -31,19 +31,6 @@
)
parser.add_argument("--approach", type=str, default='static',
help="Select from ['dynamic', 'static', 'weight-only']")
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto",
help="Smooth quant parameter.")
# ============gptq configs===============
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length')
# =======================================
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
parser.add_argument("--accuracy", action="store_true")
@@ -58,11 +45,28 @@
parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
"hellaswag","winogrande","piqa","wikitext"],
type=str, help="tasks list for accuracy validation")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# ============SmoothQuant configs==============
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.")
# ============WeightOnly configs===============
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--weight_only_bits", type=int, default=8)
parser.add_argument("--weight_only_group", type=int, default=-1)
parser.add_argument("--weight_only_scheme", default="sym")
parser.add_argument("--weight_only_mse_range", action="store_true")
parser.add_argument("--weight_only_sym_full_range", action="store_true")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# =============GPTQ configs====================
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
this should align with your model config, \
and your dataset builder args: args.pad_max_length')
# =======================================

args = parser.parse_args()
if args.ipex:
@@ -231,7 +235,9 @@ def calib_func(prepared_model):
prepared_model(calib_input[0])

recipes = {}
eval_func = None
from neural_compressor import PostTrainingQuantConfig, quantization
# specify the op_type_dict and op_name_dict
if args.approach == 'weight_only':
op_type_dict = {
'.*':{ # re.match
@@ -243,55 +249,41 @@ def calib_func(prepared_model):
},
},
}
if args.weight_only_sym_full_range:
recipes.update({"rtn_args": {"sym_full_range": True}})
else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
}
else:
op_type_dict = {}
excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
if args.sq:
# alpha can be a float number or a list of float numbers.
args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha)
if re.search("falcon", user_model.config.model_type):
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}}
else:
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}}
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
recipes=recipes,
)
elif args.weight_only_algo == "GPTQ":
recipes = {
'gptq_args': {
'percdamp': args.gptq_percdamp,
'act_order':args.gptq_actorder,
'block_size': args.gptq_block_size,
'nsamples': args.gptq_nsamples,
'use_max_length': args.gptq_use_max_length
}
op_name_dict={
'lm_head':{"weight": {'dtype': 'fp32'},},
'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2
}
recipes["rtn_args"] = {
"mse_range": args.weight_only_mse_range,
"sym_full_range": args.weight_only_sym_full_range,
}
# GPTQ: use assistive functions to modify calib_dataloader and calib_func
# TEQ: set calib_func=None, use default training func as calib_func
if args.weight_only_algo in ["GPTQ", "TEQ"]:
calib_func = None

conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
op_name_dict={
'.*lm_head':{ # re.match
"weight": {
'dtype': 'fp32'
},
},
},
op_name_dict=op_name_dict,
recipes=recipes,
)
else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
}
else:
op_type_dict = {}
excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
if args.sq:
# alpha can be a float number or a list of float numbers.
args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha)
if re.search("falcon", user_model.config.model_type):
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}}
else:
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}}

conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
@@ -300,20 +292,13 @@ def calib_func(prepared_model):
recipes=recipes,
)

# when GPTQ is enabled: use assistive functions to modify calib_dataloader and calib_func
if args.weight_only_algo == "GPTQ":
calib_func = None
if args.weight_only_algo == 'TEQ':
# set calib_func=None, use default training func as calib_func
calib_func = None

eval_dataset = load_dataset('lambada', split='validation')
evaluator = Evaluator(eval_dataset, tokenizer)
def eval_func(model):
acc = evaluator.evaluate(model)
return acc
# eval_func should be set when tuning alpha.
eval_func = eval_func if isinstance(args.alpha, list) else None
# eval_func should be set when tuning alpha.
if isinstance(args.alpha, list):
eval_dataset = load_dataset('lambada', split='validation')
evaluator = Evaluator(eval_dataset, tokenizer)
def eval_func(model):
acc = evaluator.evaluate(model)
return acc

q_model = quantization.fit(
user_model,
@@ -339,7 +324,7 @@

if args.accuracy:
user_model.eval()
from intel_extension_for_transformers.evaluation.lm_eval import evaluate
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
results = evaluate(
model="hf-causal",
model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
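Two behavioral points in the reshuffle above: `eval_func` now defaults to `None` and is only defined when `--alpha` parses to a Python list (alpha tuning needs an accuracy signal), and GPTQ/TEQ both set `calib_func = None` (GPTQ drives calibration through its assistive functions; TEQ falls back to its default training function). A SmoothQuant run that exercises the alpha-tuning path might look like the following sketch (the alpha list is illustrative):

```bash
# args.alpha is eval()'d, so a quoted list triggers the eval_func branch
# used for SmoothQuant alpha tuning. Values below are illustrative.
python run_clm_no_trainer.py \
    --model facebook/opt-125m \
    --quantize \
    --sq \
    --alpha "[0.5, 0.6, 0.7, 0.8]" \
    --ipex \
    --output_dir "saved_results"
```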
examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh

@@ -145,13 +145,36 @@ function run_tuning {
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128"
elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ"
elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ"
elif [ "${topology}" = "opt_125m" ]; then
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="PostTrainingStatic"
extra_cmd=$extra_cmd" --int8_bf16_mixed"
alpha=0.8
extra_cmd=$extra_cmd" --sq --alpha "${alpha}
elif [ "${topology}" = "opt_1.3b" ]; then
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
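On the tuning side there is now one topology per weight-only algorithm (RTN with MSE range, AWQ, GPTQ, TEQ) plus the static SmoothQuant path for `opt_125m`. Assuming the same `--key=value` wrapper syntax as above, the new entry points are exercised like so:

```bash
# One run_tuning.sh call per new topology; --key=value syntax is assumed.
bash run_tuning.sh --topology=opt_125m_weight_only_gptq --task=clm \
    --approach=weight_only --output_model=saved_results    # GPTQ
bash run_tuning.sh --topology=opt_125m_weight_only_teq --task=clm \
    --approach=weight_only --output_model=saved_results    # TEQ
bash run_tuning.sh --topology=opt_125m --task=clm --approach=static \
    --backend=ipex --output_model=saved_results            # SmoothQuant + IPEX
```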