This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

refine example for SQ and WOQ #209

Merged · 6 commits · Sep 8, 2023

Changes from 1 commit
81 changes: 78 additions & 3 deletions examples/.config/pytorch_optimize.json
@@ -1592,6 +1592,30 @@
}
}
},
"chatglm_clm_weight_only": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_weight_only": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
@@ -1640,12 +1664,12 @@
}
}
},
"chatglm_clm_weight_only": {
"opt_125m_clm_weight_only_gptq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_weight_only_gptq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
@@ -1654,7 +1678,31 @@
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "chatglm_weight_only",
"topology": "opt_125m_weight_only_gptq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
"config": "saved_results",
"iters": "100",
"int8": "false"
}
}
},
"opt_125m_clm_weight_only_teq": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m_weight_only_teq",
"task": "clm",
"approach": "weight_only",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m_weight_only_teq",
"task": "clm",
"mode": "accuracy",
"batch_size": "112",
@@ -1664,6 +1712,33 @@
}
}
},
"opt_125m_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
"cmd": "bash run_tuning.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"output_model": "saved_results"
}
},
"benchmark": {
"cmd": "bash run_benchmark.sh",
"params": {
"topology": "opt_125m",
"task": "clm",
"approach": "static",
"backend": "ipex",
"mode": "accuracy",
"batch_size": "112",
"iters": "100",
"int8": "false",
"config": "saved_results"
}
}
},
"opt_1.3b_clm_ipex": {
"working_dir": "huggingface/pytorch/language-modeling/quantization",
"tune": {
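Each config entry above pairs a `tune` and a `benchmark` command with a flat set of params. Assuming the test harness expands each param as a `--key=value` flag for the wrapper scripts (a convention inferred from these examples; the harness itself is not shown in this diff), the new `chatglm_clm_weight_only` entry would expand to roughly:

```bash
# Hypothetical expansion of the "chatglm_clm_weight_only" entry above;
# the --key=value syntax is an assumption, not shown in this diff.
cd examples/huggingface/pytorch/language-modeling/quantization

# tune: produce the weight-only quantized model
bash run_tuning.sh --topology=chatglm_weight_only --task=clm \
    --approach=weight_only --output_model=saved_results

# benchmark: accuracy mode against the FP32 baseline (int8=false)
bash run_benchmark.sh --topology=chatglm_weight_only --task=clm \
    --mode=accuracy --batch_size=112 --config=saved_results \
    --iters=100 --int8=false
```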
examples/huggingface/pytorch/language-modeling/quantization/README.md

@@ -48,9 +48,16 @@
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--approach weight_only \
--output_dir "saved_results" \
--weight_only_bits 4 \
--weight_only_group 128 \
--weight_only_scheme asym \
--weight_only_algo RTN \
--weight_only_mse_range \
--output_dir "saved_results"
```
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`; `--awq` triggers the AWQ algorithm and `--gptq` triggers the GPTQ algorithm. For example, to run a GPTQ example, try the following command.
**Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [the weight-only quantization documentation](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md).


```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
# … (unchanged lines collapsed in this diff view)
```

@@ -242,5 +249,5 @@
```bash
python run_mlm.py \
--overwrite_output_dir
```

[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
[2]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers." arXiv preprint arXiv:2210.17323 (2022).
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
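The note above lists RTN, GPTQ, AWQ, and TEQ as the supported weight-only algorithms but only shows an RTN command. A GPTQ counterpart, sketched from the flags this PR adds to `run_clm_no_trainer.py` (all flags exist in the new argparse; the particular values are illustrative, not tuned settings):

```bash
# Sketch of a weight-only GPTQ run; flags come from this PR's argparse,
# but the values below are illustrative.
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --quantize \
    --approach weight_only \
    --weight_only_algo GPTQ \
    --weight_only_bits 4 \
    --gptq_block_size 128 \
    --gptq_nsamples 128 \
    --gptq_use_max_length \
    --output_dir "saved_results"
```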
examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh

@@ -107,16 +107,20 @@ function run_benchmark {
elif [ "${topology}" = "falcon_7b_instruct" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="tiiuae/falcon-7b-instruct"
elif [ "${topology}" = "opt_125m_weight_only" ]; then
elif [ "${topology}" = "opt_125m_weight_only" -o \
"${topology}" = "opt_125m_weight_only_awq" -o \
"${topology}" = "opt_125m_weight_only_gptq" -o \
"${topology}" = "opt_125m_weight_only_teq" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
elif [ "${topology}" = "opt_125m" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-125m"
lm_eval_tasks="lambada_openai"
extra_cmd=$extra_cmd" --approach weight_only"
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
elif [ "${topology}" = "opt_1.3b" ]; then
script="run_clm_no_trainer.py"
model_name_or_path="facebook/opt-1.3b"
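The consolidated first branch now routes all four `opt_125m_weight_only*` topologies through the same model/task setup, while the new `opt_125m` branch handles the static path and appends `--ipex` only when the backend asks for it. Under the same `--key=value` assumption as above, the new static/IPEX benchmark would be driven like:

```bash
# Assumed invocation for the new "opt_125m" branch; flag parsing is
# inferred from the params in pytorch_optimize.json, not shown here.
bash run_benchmark.sh --topology=opt_125m --task=clm --approach=static \
    --backend=ipex --mode=accuracy --batch_size=112 \
    --config=saved_results --iters=100 --int8=false
```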
examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py

@@ -31,19 +31,6 @@
)
parser.add_argument("--approach", type=str, default='static',
help="Select from ['dynamic', 'static', 'weight-only']")
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto",
help="Smooth quant parameter.")
# ============gptq configs===============
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, this should align with your model config, and your dataset builder args: args.pad_max_length')
# =======================================
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
parser.add_argument("--accuracy", action="store_true")
@@ -58,11 +45,28 @@
parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
"hellaswag","winogrande","piqa","wikitext"],
type=str, help="tasks list for accuracy validation")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# ============SmoothQuant configs==============
parser.add_argument("--sq", action="store_true")
parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.")
# ============WeightOnly configs===============
parser.add_argument("--weight_only_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
help="Weight-only parameter.")
parser.add_argument("--weight_only_bits", type=int, default=8)
parser.add_argument("--weight_only_group", type=int, default=-1)
parser.add_argument("--weight_only_scheme", default="sym")
parser.add_argument("--weight_only_mse_range", action="store_true")
parser.add_argument("--weight_only_sym_full_range", action="store_true")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# =============GPTQ configs====================
parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
this should align with your model config, \
and your dataset builder args: args.pad_max_length')
# =======================================

args = parser.parse_args()
if args.ipex:
@@ -231,7 +235,9 @@ def calib_func(prepared_model):
prepared_model(calib_input[0])

recipes = {}
eval_func = None
from neural_compressor import PostTrainingQuantConfig, quantization
# specify the op_type_dict and op_name_dict
if args.approach == 'weight_only':
op_type_dict = {
'.*':{ # re.match
@@ -243,55 +249,41 @@ def calib_func(prepared_model):
},
},
}
if args.weight_only_sym_full_range:
recipes.update({"rtn_args": {"sym_full_range": True}})
else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
}
else:
op_type_dict = {}
excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
if args.sq:
# alpha can be a float number or a list of float numbers.
args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha)
if re.search("falcon", user_model.config.model_type):
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}}
else:
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}}
conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
recipes=recipes,
)
elif args.weight_only_algo == "GPTQ":
recipes = {
'gptq_args': {
'percdamp': args.gptq_percdamp,
'act_order':args.gptq_actorder,
'block_size': args.gptq_block_size,
'nsamples': args.gptq_nsamples,
'use_max_length': args.gptq_use_max_length
}
op_name_dict={
'lm_head':{"weight": {'dtype': 'fp32'},},
'embed_out':{"weight": {'dtype': 'fp32'},}, # for dolly_v2
}
recipes["rtn_args"] = {
"mse_range": args.weight_only_mse_range,
"sym_full_range": args.weight_only_sym_full_range,
}
# GPTQ: use assistive functions to modify calib_dataloader and calib_func
# TEQ: set calib_func=None, use default training func as calib_func
if args.weight_only_algo in ["GPTQ", "TEQ"]:
calib_func = None

conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
excluded_precisions=excluded_precisions,
op_type_dict=op_type_dict,
op_name_dict={
'.*lm_head':{ # re.match
"weight": {
'dtype': 'fp32'
},
},
},
op_name_dict=op_name_dict,
recipes=recipes,
)
else:
if re.search("gpt", user_model.config.model_type):
op_type_dict = {
"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}},
}
else:
op_type_dict = {}
excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
if args.sq:
# alpha can be a float number or a list of float numbers.
args.alpha = args.alpha if args.alpha == "auto" else eval(args.alpha)
if re.search("falcon", user_model.config.model_type):
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha, 'folding': False}}
else:
recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': args.alpha}}

conf = PostTrainingQuantConfig(
backend="ipex" if args.ipex else "default",
approach=args.approach,
@@ -300,20 +292,13 @@ def calib_func(prepared_model):
recipes=recipes,
)

# when GPTQ is enabled: use assistive functions to modify calib_dataloader and calib_func
if args.weight_only_algo == "GPTQ":
calib_func = None
if args.weight_only_algo == 'TEQ':
# set calib_func=None, use default training func as calib_func
calib_func = None

eval_dataset = load_dataset('lambada', split='validation')
evaluator = Evaluator(eval_dataset, tokenizer)
def eval_func(model):
acc = evaluator.evaluate(model)
return acc
# eval_func should be set when tuning alpha.
eval_func = eval_func if isinstance(args.alpha, list) else None
# eval_func should be set when tuning alpha.
if isinstance(args.alpha, list):
eval_dataset = load_dataset('lambada', split='validation')
evaluator = Evaluator(eval_dataset, tokenizer)
def eval_func(model):
acc = evaluator.evaluate(model)
return acc

q_model = quantization.fit(
user_model,
@@ -339,7 +324,7 @@

if args.accuracy:
user_model.eval()
from intel_extension_for_transformers.evaluation.lm_eval import evaluate
from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
results = evaluate(
model="hf-causal",
model_args='pretrained='+args.model+',tokenizer='+args.model+',dtype=float32',
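Two behavioral points in the reshuffle above: `eval_func` now defaults to `None` and is only defined when `--alpha` parses to a Python list (alpha tuning needs an accuracy signal), and GPTQ/TEQ both set `calib_func = None` (GPTQ drives calibration through its assistive functions; TEQ falls back to its default training function). A SmoothQuant run that exercises the alpha-tuning path might look like the following sketch (the alpha list is illustrative):

```bash
# args.alpha is eval()'d, so a quoted list triggers the eval_func branch
# used for SmoothQuant alpha tuning. Values below are illustrative.
python run_clm_no_trainer.py \
    --model facebook/opt-125m \
    --quantize \
    --sq \
    --alpha "[0.5, 0.6, 0.7, 0.8]" \
    --ipex \
    --output_dir "saved_results"
```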
examples/huggingface/pytorch/language-modeling/quantization/run_tuning.sh

@@ -145,13 +145,36 @@ function run_tuning {
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo RTN --weight_only_mse_range"
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo AWQ --calib_iters 128"
elif [ "${topology}" = "opt_125m_weight_only_gptq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo GPTQ"
elif [ "${topology}" = "opt_125m_weight_only_teq" ]; then
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="weight_only"
extra_cmd=$extra_cmd" --approach weight_only --weight_only_algo TEQ"
elif [ "${topology}" = "opt_125m" ]; then
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
fi
script="run_clm_no_trainer.py"
DATASET_NAME="NeelNanda/pile-10k"
model_name_or_path="facebook/opt-125m"
approach="PostTrainingStatic"
extra_cmd=$extra_cmd" --int8_bf16_mixed"
alpha=0.8
extra_cmd=$extra_cmd" --sq --alpha "${alpha}
elif [ "${topology}" = "opt_1.3b" ]; then
if [ "${backend}" = "ipex" ]; then
extra_cmd=$extra_cmd" --ipex"
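On the tuning side there is now one topology per weight-only algorithm (RTN with MSE range, AWQ, GPTQ, TEQ) plus the static SmoothQuant path for `opt_125m`. Assuming the same `--key=value` wrapper syntax as above, the new entry points are exercised like so:

```bash
# One run_tuning.sh call per new topology; --key=value syntax is assumed.
bash run_tuning.sh --topology=opt_125m_weight_only_gptq --task=clm \
    --approach=weight_only --output_model=saved_results    # GPTQ
bash run_tuning.sh --topology=opt_125m_weight_only_teq --task=clm \
    --approach=weight_only --output_model=saved_results    # TEQ
bash run_tuning.sh --topology=opt_125m --task=clm --approach=static \
    --backend=ipex --output_model=saved_results            # SmoothQuant + IPEX
```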