From 56f8064fc4c619261aebf1bfde4f182d09a7fa64 Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 31 Aug 2023 16:21:04 +0800 Subject: [PATCH 1/4] refine example for SQ and WOQ Signed-off-by: Xin He --- examples/.config/pytorch_optimize.json | 81 ++++++++++- .../language-modeling/quantization/README.md | 15 ++- .../quantization/run_benchmark.sh | 12 +- .../quantization/run_clm_no_trainer.py | 127 ++++++++---------- .../quantization/run_tuning.sh | 25 +++- 5 files changed, 177 insertions(+), 83 deletions(-) diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index f7e34838053..4ee6ce1ef1d 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -1592,6 +1592,30 @@ } } }, + "chatglm_clm_weight_only": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "chatglm_weight_only", + "task": "clm", + "approach": "weight_only", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "chatglm_weight_only", + "task": "clm", + "mode": "accuracy", + "batch_size": "112", + "config": "saved_results", + "iters": "100", + "int8": "false" + } + } + }, "opt_125m_clm_weight_only": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { @@ -1640,12 +1664,12 @@ } } }, - "chatglm_clm_weight_only": { + "opt_125m_clm_weight_only_gptq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "opt_125m_weight_only_gptq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1654,7 +1678,31 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "opt_125m_weight_only_gptq", + "task": "clm", + "mode": "accuracy", + "batch_size": "112", + "config": "saved_results", + "iters": "100", + "int8": "false" + } + } + }, + "opt_125m_clm_weight_only_teq": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "opt_125m_weight_only_teq", + "task": "clm", + "approach": "weight_only", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "opt_125m_weight_only_teq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1664,6 +1712,33 @@ } } }, + "opt_125m_clm_ipex": { + "working_dir": "huggingface/pytorch/language-modeling/quantization", + "tune": { + "cmd": "bash run_tuning.sh", + "params": { + "topology": "opt_125m", + "task": "clm", + "approach": "static", + "backend": "ipex", + "output_model": "saved_results" + } + }, + "benchmark": { + "cmd": "bash run_benchmark.sh", + "params": { + "topology": "opt_125m", + "task": "clm", + "approach": "static", + "backend": "ipex", + "mode": "accuracy", + "batch_size": "112", + "iters": "100", + "int8": "false", + "config": "saved_results" + } + } + }, "opt_1.3b_clm_ipex": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { diff --git a/examples/huggingface/pytorch/language-modeling/quantization/README.md b/examples/huggingface/pytorch/language-modeling/quantization/README.md index a016b29077a..52629e0afc5 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/README.md +++ 
b/examples/huggingface/pytorch/language-modeling/quantization/README.md @@ -48,9 +48,16 @@ python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --quantize \ --approach weight_only \ - --output_dir "saved_results" \ + --weight_only_bits 4 \ + --weight_only_group 128 \ + --weight_only_scheme asym \ + --weight_only_algo RTN \ + --weight_only_mse_range \ + --output_dir "saved_results" ``` -**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger AWQ algorithm. `--gptq` will trigger GPTQ algorithm. For example, to run a GPTQ example, try the following command. +**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to the [weight-only quantization documentation](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). + + ```bash python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ @@ -242,5 +249,5 @@ python run_mlm.py \ --overwrite_output_dir ``` -[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023). -[2]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023). +[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022). +[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023). diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh index e2702f0f0cd..9f9116ab3f9 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh @@ -107,16 +107,20 @@ function run_benchmark { elif [ "${topology}" = "falcon_7b_instruct" ]; then script="run_clm_no_trainer.py" model_name_or_path="tiiuae/falcon-7b-instruct" - elif [ "${topology}" = "opt_125m_weight_only" ]; then + elif [ "${topology}" = "opt_125m_weight_only" -o \ "${topology}" = "opt_125m_weight_only_awq" -o \ "${topology}" = "opt_125m_weight_only_gptq" -o \ "${topology}" = "opt_125m_weight_only_teq" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then + elif [ "${topology}" = "opt_125m" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" - lm_eval_tasks="lambada_openai" - extra_cmd=$extra_cmd" --approach weight_only" + if [ "${backend}" = "ipex" ]; then + extra_cmd=$extra_cmd" --ipex" + fi elif [ "${topology}" = "opt_1.3b" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-1.3b" diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index bd0a96a1292..7e8cf77abfd 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -31,19 +31,6 @@ )
parser.add_argument("--approach", type=str, default='static', help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--sq", action="store_true") -parser.add_argument("--alpha", default="auto", - help="Smooth quant parameter.") -# ============gptq configs=============== -parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") -parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') -parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.') -parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.') -parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length') -parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length') -# ======================================= -parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], - help="Weight-only parameter.") parser.add_argument("--int8", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") parser.add_argument("--accuracy", action="store_true") @@ -58,11 +45,28 @@ parser.add_argument("--tasks", nargs='+', default=["lambada_openai", "hellaswag","winogrande","piqa","wikitext"], type=str, help="tasks list for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") +# ============WeightOnly configs=============== +parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], + help="Weight-only parameter.") parser.add_argument("--weight_only_bits", type=int, default=8) parser.add_argument("--weight_only_group", type=int, default=-1) parser.add_argument("--weight_only_scheme", default="sym") +parser.add_argument("--weight_only_mse_range", action="store_true") parser.add_argument("--weight_only_sym_full_range", action="store_true") -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# =============GPTQ configs==================== +parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") +parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') +parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. 
sub weight matrix size to run GPTQ.') +parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.') +parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length') +parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \ + this should align with your model config, \ + and your dataset builder args: args.pad_max_length') +# ======================================= args = parser.parse_args() if args.ipex: @@ -231,7 +235,9 @@ def calib_func(prepared_model): prepared_model(calib_input[0]) recipes = {} + eval_func = None from neural_compressor import PostTrainingQuantConfig, quantization + # specify the op_type_dict and op_name_dict if args.approach == 'weight_only': op_type_dict = { '.*':{ # re.match @@ -243,55 +249,41 @@ def calib_func(prepared_model): }, }, } - if args.weight_only_sym_full_range: - recipes.update({"rtn_args": {"sym_full_range": True}}) - else: - if re.search("gpt", user_model.config.model_type): - op_type_dict = { - "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, - } - else: - op_type_dict = {} - excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if args.sq: - # alpha can be a float number of a list of float number. - args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha) - if re.search("falcon", user_model.config.model_type): - recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}} - else: - recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}} - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - approach=args.approach, - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - recipes=recipes, - ) - elif args.weight_only_algo == "GPTQ": - recipes = { - 'gptq_args': { - 'percdamp': args.gptq_percdamp, - 'act_order':args.gptq_actorder, - 'block_size': args.gptq_block_size, - 'nsamples': args.gptq_nsamples, - 'use_max_length': args.gptq_use_max_length - } + op_name_dict={ + 'lm_head':{"weight": {'dtype': 'fp32'},}, + 'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2 } + recipes["rtn_args"] = { + "mse_range": args.weight_only_mse_range, + "sym_full_range": args.weight_only_sym_full_range, + } + # GPTQ: use assistive functions to modify calib_dataloader and calib_func + # TEQ: set calib_func=None, use default training func as calib_func + if args.weight_only_algo in ["GPTQ", "TEQ"]: + calib_func = None + conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", approach=args.approach, - excluded_precisions=excluded_precisions, op_type_dict=op_type_dict, - op_name_dict={ - '.*lm_head':{ # re.match - "weight": { - 'dtype': 'fp32' - }, - }, - }, + op_name_dict=op_name_dict, recipes=recipes, ) else: + if re.search("gpt", user_model.config.model_type): + op_type_dict = { + "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, + } + else: + op_type_dict = {} + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if args.sq: + # alpha can be a float number of a list of float number. 
+ args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha) + if re.search("falcon", user_model.config.model_type): + recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}} + else: + recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}} + conf = PostTrainingQuantConfig( backend="ipex" if args.ipex else "default", approach=args.approach, @@ -300,20 +292,13 @@ def calib_func(prepared_model): recipes=recipes, ) - # when GPTQ is enabled: use assistive functions to modify calib_dataloader and calib_func - if args.weight_only_algo == "GPTQ": - calib_func = None - if args.weight_only_algo == 'TEQ': - # set calib_func=None, use default training func as calib_func - calib_func = None - - eval_dataset = load_dataset('lambada', split='validation') - evaluator = Evaluator(eval_dataset, tokenizer) - def eval_func(model): - acc = evaluator.evaluate(model) - return acc - # eval_func should be set when tuning alpha. - eval_func = eval_func if isinstance(args.alpha, list) else None + # eval_func should be set when tuning alpha. + if isinstance(args.alpha, list): + eval_dataset = load_dataset('lambada', split='validation') + evaluator = Evaluator(eval_dataset, tokenizer) + def eval_func(model): + acc = evaluator.evaluate(model) + return acc q_model = quantization.fit( user_model, @@ -339,7 +324,7 @@ def eval_func(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.evaluation.lm_eval import evaluate + from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate results = evaluate( model="hf-causal", model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32', diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh index e363363cafe..003996a6913 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh @@ -145,13 +145,36 @@ function run_tuning { DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range" elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" + elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="weight_only" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ" + elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="weight_only" + extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ" + elif [ "${topology}" = "opt_125m" ]; then + if [ "${backend}" = "ipex" ]; then + extra_cmd=$extra_cmd" --ipex" + fi + script="run_clm_no_trainer.py" + DATASET_NAME="NeelNanda/pile-10k" + model_name_or_path="facebook/opt-125m" + approach="PostTrainingStatic" + extra_cmd=$extra_cmd" --int8_bf16_mixed" + alpha=0.8 + extra_cmd=$extra_cmd" --sq --alpha "${alpha} elif [ 
"${topology}" = "opt_1.3b" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" From 272b8da360bdc16c32c1f1cee86c3b9208d8a954 Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 1 Sep 2023 09:11:48 +0800 Subject: [PATCH 2/4] add gptq args Signed-off-by: Xin He --- .../language-modeling/quantization/run_clm_no_trainer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index 7e8cf77abfd..43520a189ed 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -257,6 +257,13 @@ def calib_func(prepared_model): "mse_range": args.weight_only_mse_range, "sym_full_range": args.weight_only_sym_full_range, } + recipes['gptq_args'] = { + 'percdamp': args.gptq_percdamp, + 'act_order':args.gptq_actorder, + 'block_size': args.gptq_block_size, + 'nsamples': args.gptq_nsamples, + 'use_max_length': args.gptq_use_max_length + } # GPTQ: use assistive functions to modify calib_dataloader and calib_func # TEQ: set calib_func=None, use default training func as calib_func if args.weight_only_algo in ["GPTQ", "TEQ"]: From e5b3bffdbb3ff6b2864bc7e3dfd96a007508b831 Mon Sep 17 00:00:00 2001 From: Xin He Date: Sun, 3 Sep 2023 16:58:18 +0800 Subject: [PATCH 3/4] use woq as args name Signed-off-by: Xin He --- examples/.config/pytorch_optimize.json | 42 +++++++++---------- .../quantization/run_benchmark.sh | 14 +++---- .../quantization/run_clm_no_trainer.py | 32 +++++++------- .../quantization/run_tuning.sh | 26 ++++++------ 4 files changed, 57 insertions(+), 57 deletions(-) diff --git a/examples/.config/pytorch_optimize.json b/examples/.config/pytorch_optimize.json index 4ee6ce1ef1d..8d1b8c627c1 100644 --- a/examples/.config/pytorch_optimize.json +++ b/examples/.config/pytorch_optimize.json @@ -1465,12 +1465,12 @@ } } }, - "gpt_j_6b_clm_weight_only": { + "gpt_j_6b_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "gpt_j_weight_only", + "topology": "gpt_j_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1479,7 +1479,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "gpt_j_weight_only", + "topology": "gpt_j_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1489,12 +1489,12 @@ } } }, - "gpt_j_6b_clm_weight_only_awq": { + "gpt_j_6b_clm_woq_awq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "gpt_j_weight_only_awq", + "topology": "gpt_j_woq_awq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1503,7 +1503,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "gpt_j_weight_only_awq", + "topology": "gpt_j_woq_awq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1592,12 +1592,12 @@ } } }, - "chatglm_clm_weight_only": { + "chatglm_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "chatglm_weight_only", + "topology": "chatglm_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1606,7 +1606,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": 
"chatglm_weight_only", + "topology": "chatglm_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1616,12 +1616,12 @@ } } }, - "opt_125m_clm_weight_only": { + "opt_125m_clm_woq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only", + "topology": "opt_125m_woq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1630,7 +1630,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only", + "topology": "opt_125m_woq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1640,12 +1640,12 @@ } } }, - "opt_125m_clm_weight_only_awq": { + "opt_125m_clm_woq_awq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_awq", + "topology": "opt_125m_woq_awq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1654,7 +1654,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_awq", + "topology": "opt_125m_woq_awq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1664,12 +1664,12 @@ } } }, - "opt_125m_clm_weight_only_gptq": { + "opt_125m_clm_woq_gptq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_gptq", + "topology": "opt_125m_woq_gptq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1678,7 +1678,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_gptq", + "topology": "opt_125m_woq_gptq", "task": "clm", "mode": "accuracy", "batch_size": "112", @@ -1688,12 +1688,12 @@ } } }, - "opt_125m_clm_weight_only_teq": { + "opt_125m_clm_woq_teq": { "working_dir": "huggingface/pytorch/language-modeling/quantization", "tune": { "cmd": "bash run_tuning.sh", "params": { - "topology": "opt_125m_weight_only_teq", + "topology": "opt_125m_woq_teq", "task": "clm", "approach": "weight_only", "output_model": "saved_results" @@ -1702,7 +1702,7 @@ "benchmark": { "cmd": "bash run_benchmark.sh", "params": { - "topology": "opt_125m_weight_only_teq", + "topology": "opt_125m_woq_teq", "task": "clm", "mode": "accuracy", "batch_size": "112", diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh index 9f9116ab3f9..24912b52f5e 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh @@ -83,17 +83,17 @@ function run_benchmark { model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" extra_cmd=$extra_cmd" --ipex" fi - elif [ "${topology}" = "gpt_j_weight_only" ]; then + elif [ "${topology}" = "gpt_j_woq" ]; then script="run_clm_no_trainer.py" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "chatglm_weight_only" ]; then + elif [ "${topology}" = "chatglm_woq" ]; then script="run_clm_no_trainer.py" model_name_or_path="THUDM/chatglm-6b" lm_eval_tasks="lambada_openai" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then + elif [ "${topology}" = "gpt_j_woq_awq" ]; then 
script="run_clm_no_trainer.py" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" lm_eval_tasks="lambada_openai" @@ -107,10 +107,10 @@ function run_benchmark { elif [ "${topology}" = "falcon_7b_instruct" ]; then script="run_clm_no_trainer.py" model_name_or_path="tiiuae/falcon-7b-instruct" - elif [ "${topology}" = "opt_125m_weight_only" -o \ - "${topology}" = "opt_125m_weight_only_awq" -o \ - "${topology}" = "opt_125m_weight_only_gptq" -o \ - "${topology}" = "opt_125m_weight_only_teq" ]; then + elif [ "${topology}" = "opt_125m_woq" -o \ + "${topology}" = "opt_125m_woq_awq" -o \ + "${topology}" = "opt_125m_woq_gptq" -o \ + "${topology}" = "opt_125m_woq_teq" ]; then script="run_clm_no_trainer.py" model_name_or_path="facebook/opt-125m" lm_eval_tasks="lambada_openai" diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py index 43520a189ed..89cba83383c 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py @@ -50,13 +50,13 @@ parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") # ============WeightOnly configs=============== -parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], help="Weight-only parameter.") -parser.add_argument("--weight_only_bits", type=int, default=8) -parser.add_argument("--weight_only_group", type=int, default=-1) -parser.add_argument("--weight_only_scheme", default="sym") -parser.add_argument("--weight_only_mse_range", action="store_true") -parser.add_argument("--weight_only_sym_full_range", action="store_true") +parser.add_argument("--woq_bits", type=int, default=8) +parser.add_argument("--woq_group_size", type=int, default=-1) +parser.add_argument("--woq_scheme", default="sym") +parser.add_argument("--woq_mse_range", action="store_true") +parser.add_argument("--woq_sym_full_range", action="store_true") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.') @@ -88,7 +88,7 @@ def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_ @torch.no_grad() def tokenize_function(self, examples): - if args.weight_only_algo in ['TEQ']: + if args.woq_algo in ['TEQ']: if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token example = self.tokenizer(examples["text"], padding="max_length", max_length=self.pad_max) @@ -153,7 +153,7 @@ def evaluate(self, model): def get_user_model(): from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer torchscript = False - if args.sq or args.weight_only_algo in ['AWQ', 'TEQ']: + if args.sq or args.woq_algo in ['AWQ', 'TEQ']: torchscript = True if re.search("llama", args.model.lower()): import transformers @@ -202,7 +202,7 @@ def get_user_model(): tokenizer = AutoTokenizer.from_pretrained(args.model) # Set model's seq_len when GPTQ calibration is enabled. 
- if args.weight_only_algo == 'GPTQ': + if args.woq_algo == 'GPTQ': user_model.seqlen = args.gptq_pad_max_length if args.peft_model_id is not None: @@ -242,10 +242,10 @@ def calib_func(prepared_model): op_type_dict = { '.*':{ # re.match "weight": { - 'bits': args.weight_only_bits, # 1-8 bits - 'group_size': args.weight_only_group, # -1 (per-channel) - 'scheme': args.weight_only_scheme, # sym/asym - 'algorithm': args.weight_only_algo, # RTN/AWQ/TEQ + 'bits': args.woq_bits, # 1-8 bits + 'group_size': args.woq_group_size, # -1 (per-channel) + 'scheme': args.woq_scheme, # sym/asym + 'algorithm': args.woq_algo, # RTN/AWQ/TEQ }, }, } @@ -254,8 +254,8 @@ def calib_func(prepared_model): 'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2 } recipes["rtn_args"] = { - "mse_range": args.weight_only_mse_range, - "sym_full_range": args.weight_only_sym_full_range, + "mse_range": args.woq_mse_range, + "sym_full_range": args.woq_sym_full_range, } recipes['gptq_args'] = { 'percdamp': args.gptq_percdamp, @@ -266,7 +266,7 @@ def calib_func(prepared_model): } # GPTQ: use assistive functions to modify calib_dataloader and calib_func # TEQ: set calib_func=None, use default training func as calib_func - if args.weight_only_algo in ["GPTQ", "TEQ"]: + if args.woq_algo in ["GPTQ", "TEQ"]: calib_func = None conf = PostTrainingQuantConfig( diff --git a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh index 003996a6913..250d31780be 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh +++ b/examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh @@ -106,24 +106,24 @@ function run_tuning { extra_cmd=$extra_cmd" --int8_bf16_mixed" extra_cmd=$extra_cmd" --sq --alpha "${alpha} fi - elif [ "${topology}" = "gpt_j_weight_only" ]; then + elif [ "${topology}" = "gpt_j_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" approach="weight_only" extra_cmd=$extra_cmd" --approach weight_only" - elif [ "${topology}" = "chatglm_weight_only" ]; then + elif [ "${topology}" = "chatglm_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="THUDM/chatglm-6b" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN" - elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo RTN" + elif [ "${topology}" = "gpt_j_woq_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" + extra_cmd=$extra_cmd" --approach weight_only --woq_algo AWQ --calib_iters 128" elif [ "${topology}" = "mpt_7b_chat" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" @@ -140,30 +140,30 @@ function run_tuning { approach="PostTrainingStatic" alpha=0.7 extra_cmd=$extra_cmd" --sq --alpha "${alpha} - elif [ "${topology}" = "opt_125m_weight_only" ]; then + elif [ "${topology}" = "opt_125m_woq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range" - elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then + 
extra_cmd=$extra_cmd" --approach weight_only --woq_algo RTN --woq_mse_range" + elif [ "${topology}" = "opt_125m_woq_awq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128" - elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo AWQ --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_gptq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ" - elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then + extra_cmd=$extra_cmd" --approach weight_only --woq_algo GPTQ" + elif [ "${topology}" = "opt_125m_woq_teq" ]; then script="run_clm_no_trainer.py" DATASET_NAME="NeelNanda/pile-10k" model_name_or_path="facebook/opt-125m" approach="weight_only" - extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ" + extra_cmd=$extra_cmd" --approach weight_only --woq_algo TEQ" elif [ "${topology}" = "opt_125m" ]; then if [ "${backend}" = "ipex" ]; then extra_cmd=$extra_cmd" --ipex" From 626c4a17023be35a5cfb5bc3c5aae95b20eb9959 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 4 Sep 2023 10:40:53 +0800 Subject: [PATCH 4/4] change readme Signed-off-by: Xin He --- .../language-modeling/quantization/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/huggingface/pytorch/language-modeling/quantization/README.md b/examples/huggingface/pytorch/language-modeling/quantization/README.md index 52629e0afc5..cd6a05cd97e 100644 --- a/examples/huggingface/pytorch/language-modeling/quantization/README.md +++ b/examples/huggingface/pytorch/language-modeling/quantization/README.md @@ -48,11 +48,11 @@ python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ --quantize \ --approach weight_only \ - --weight_only_bits 4 \ - --weight_only_group 128 \ - --weight_only_scheme asym \ - --weight_only_algo RTN \ - --weight_only_mse_range \ + --woq_bits 4 \ + --woq_group_size 128 \ + --woq_scheme asym \ + --woq_algo RTN \ + --woq_mse_range \ --output_dir "saved_results" ``` **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md) @@ -61,8 +61,8 @@ python run_clm_no_trainer.py \ ```bash python run_clm_no_trainer.py \ --model EleutherAI/gpt-j-6B \ - --weight_only_algo GPTQ \ - --weight_only_bits 4 \ + --woq_algo GPTQ \ + --woq_bits 4 \ --quantize \ --pad_max_length 2048 \ --gptq_pad_max_length 2048 \