Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
refine example for SQ and WOQ (#209)
Browse files Browse the repository at this point in the history
* refine example for SQ and WOQ

* add gptq args

* use woq as args name

* change readme

---------

Signed-off-by: Xin He <xin3.he@intel.com>
  • Loading branch information
xin3he authored Sep 8, 2023
1 parent 2651bd8 commit 1bcab14
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 112 deletions.
105 changes: 90 additions & 15 deletions examples/.config/pytorch_optimize.json
Original file line number Diff line number Diff line change
Expand Up @@ -1465,12 +1465,12 @@
}
}
},
"gpt_j_6b_clm_weight_only": {
"gpt_j_6b_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "gpt_j_weight_only",
"topology": "gpt_j_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1479,7 +1479,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "gpt_j_weight_only",
"topology": "gpt_j_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1489,12 +1489,12 @@
}
}
},
"gpt_j_6b_clm_weight_only_awq": {
"gpt_j_6b_clm_woq_awq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "gpt_j_weight_only_awq",
"topology": "gpt_j_woq_awq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1503,7 +1503,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "gpt_j_weight_only_awq",
"topology": "gpt_j_woq_awq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand Down Expand Up @@ -1592,12 +1592,12 @@
}
}
},
"opt_125m_clm_weight_only": {
"chatglm_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only",
"topology": "chatglm_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1606,7 +1606,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only",
"topology": "chatglm_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1616,12 +1616,12 @@
}
}
},
"opt_125m_clm_weight_only_awq": {
"opt_125m_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only_awq",
"topology": "opt_125m_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1630,7 +1630,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only_awq",
"topology": "opt_125m_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1640,12 +1640,12 @@
}
}
},
"chatglm_clm_weight_only": {
"opt_125m_clm_woq_awq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_woq_awq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1654,7 +1654,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_woq_awq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1664,6 +1664,81 @@
}
}
},
"opt_125m_clm_woq_gptq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_woq_gptq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_woq_gptq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_woq_teq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_woq_teq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_woq_teq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_1.3b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,21 @@ python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--approach weight_only \
--output_dir "saved_results" \
--woq_bits 4 \
--woq_group_size 128 \
--woq_scheme asym \
--woq_algo RTN \
--woq_mse_range \
--output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and supports RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger AWQ algorithm. `--gptq` will trigger GPTQ algorithm. For example, to run a GPTQ example, try the following command.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and supports the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md)


```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--weight_only_algo GPTQ \
--weight_only_bits 4 \
--woq_algo GPTQ \
--woq_bits 4 \
--quantize \
--pad_max_length 2048 \
--gptq_pad_max_length 2048 \
Expand Down Expand Up @@ -242,5 +249,5 @@ python run_mlm.py \
--overwrite_output_dir
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023).
[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,17 @@ function run_benchmark {
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "gpt_j_weight_only" ]; then
elif [ "${topology}" = "gpt_j_woq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "chatglm_weight_only" ]; then
elif [ "${topology}" = "chatglm_woq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="THUDM/chatglm-6b"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then
elif [ "${topology}" = "gpt_j_woq_awq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
lm_eval_tasks="lambada_openai"
Expand All @@ -107,16 +107,20 @@ function run_benchmark {
elif [ "${topology}" = "falcon_7b_instruct" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="tiiuae/falcon-7b-instruct"
elif [ "${topology}" = "opt_125m_weight_only" ]; then
elif [ "${topology}" = "opt_125m_woq" -o \
"${topology}" = "opt_125m_woq_awq" -o \
"${topology}" = "opt_125m_woq_gptq" -o \
"${topology}" = "opt_125m_woq_teq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
elif [ "${topology}" = "opt_125m" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "opt_1.3b" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-1.3b"
Expand Down
Loading

0 comments on commit 1bcab14

Please sign in to comment.