Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
refine example for SQ and WOQ (#209)
Browse files Browse the repository at this point in the history
* refine example for SQ and WOQ

* add gptq args

* use woq as args name

* change readme

---------

Signed-off-by: Xin He <xin3.he@intel.com>
  • Loading branch information
xin3he authored Sep 8, 2023
1 parent 2651bd8 commit 1bcab14
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 112 deletions.
105 changes: 90 additions & 15 deletions examples/.config/pytorch_optimize.json
Original file line number Diff line number Diff line change
Expand Up @@ -1465,12 +1465,12 @@
}
}
},
"gpt_j_6b_clm_weight_only": {
"gpt_j_6b_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "gpt_j_weight_only",
"topology": "gpt_j_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1479,7 +1479,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "gpt_j_weight_only",
"topology": "gpt_j_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1489,12 +1489,12 @@
}
}
},
"gpt_j_6b_clm_weight_only_awq": {
"gpt_j_6b_clm_woq_awq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "gpt_j_weight_only_awq",
"topology": "gpt_j_woq_awq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1503,7 +1503,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "gpt_j_weight_only_awq",
"topology": "gpt_j_woq_awq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand Down Expand Up @@ -1592,12 +1592,12 @@
}
}
},
"opt_125m_clm_weight_only": {
"chatglm_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only",
"topology": "chatglm_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1606,7 +1606,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only",
"topology": "chatglm_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1616,12 +1616,12 @@
}
}
},
"opt_125m_clm_weight_only_awq": {
"opt_125m_clm_woq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only_awq",
"topology": "opt_125m_woq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1630,7 +1630,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only_awq",
"topology": "opt_125m_woq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1640,12 +1640,12 @@
}
}
},
"chatglm_clm_weight_only": {
"opt_125m_clm_woq_awq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_woq_awq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
Expand All @@ -1654,7 +1654,7 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_woq_awq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
Expand All @@ -1664,6 +1664,81 @@
}
}
},
"opt_125m_clm_woq_gptq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_woq_gptq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_woq_gptq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_woq_teq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_woq_teq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_woq_teq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_1.3b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,21 @@ python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--approach weight_only \
--output_dir "saved_results" \
--woq_bits 4 \
--woq_group_size 128 \
--woq_scheme asym \
--woq_algo RTN \
--woq_mse_range \
--output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and supports RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger AWQ algorithm. `--gptq` will trigger GPTQ algorithm. For example, to run a GPTQ example, try the following command.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and supports the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md)


```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--weight_only_algo GPTQ \
--weight_only_bits 4 \
--woq_algo GPTQ \
--woq_bits 4 \
--quantize \
--pad_max_length 2048 \
--gptq_pad_max_length 2048 \
Expand Down Expand Up @@ -242,5 +249,5 @@ python run_mlm.py \
--overwrite_output_dir
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023).
[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
Original file line number Diff line number Diff line change
Expand Up @@ -83,17 +83,17 @@ function run_benchmark {
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "gpt_j_weight_only" ]; then
elif [ "${topology}" = "gpt_j_woq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "chatglm_weight_only" ]; then
elif [ "${topology}" = "chatglm_woq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="THUDM/chatglm-6b"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then
elif [ "${topology}" = "gpt_j_woq_awq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
lm_eval_tasks="lambada_openai"
Expand All @@ -107,16 +107,20 @@ function run_benchmark {
elif [ "${topology}" = "falcon_7b_instruct" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="tiiuae/falcon-7b-instruct"
elif [ "${topology}" = "opt_125m_weight_only" ]; then
elif [ "${topology}" = "opt_125m_woq" -o \
"${topology}" = "opt_125m_woq_awq" -o \
"${topology}" = "opt_125m_woq_gptq" -o \
"${topology}" = "opt_125m_woq_teq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
elif [ "${topology}" = "opt_125m" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "opt_1.3b" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-1.3b"
Expand Down
Loading

0 comments on commit 1bcab14

Please sign in to comment.