From 6068d069964cecb9919dec289d740e056fbd6a3f Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 14 Sep 2023 04:00:50 -0700 Subject: [PATCH 01/19] enable text-generation with NeuralChat API Signed-off-by: changwangss --- .../quantization/run_generation.py | 152 ++---------------- .../llm/quantization/optimization.py | 144 ++++++++++++++++- .../neural_chat/config.py | 8 + 3 files changed, 164 insertions(+), 140 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 4774380dc58..6d7ed135f9d 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -17,6 +17,9 @@ import numpy as np from itertools import chain from optimum.utils import NormalizedConfigManager +# ipex dependency +import intel_extension_for_pytorch as ipex +from optimum.intel.generation.modeling import TSModelForCausalLM parser = argparse.ArgumentParser() @@ -58,6 +61,7 @@ parser.add_argument("--tasks", nargs='+', default=["winogrande", "copa", "piqa", "rte", "hellaswag", \ "openbookqa", "lambada_openai", "lambada_standard", "wikitext"], type=str, \ help="tasks list for accuracy validation") + args = parser.parse_args() calib_size = 1 @@ -82,6 +86,7 @@ config=config ) +# tokenizer if config.model_type == "llama": from transformers import LlamaTokenizer tokenizer = LlamaTokenizer.from_pretrained(args.model) @@ -92,125 +97,10 @@ user_model = user_model.to(memory_format=torch.channels_last) user_model.eval() -if args.ipex: - import intel_extension_for_pytorch as ipex - from optimum.intel.generation.modeling import TSModelForCausalLM - # quantize if args.quantize: - def generate_dummy_past_key_values(input_bs, user_model): - normalized_config = NormalizedConfigManager.get_normalized_config_class( - user_model.config.model_type - )(user_model.config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - - if user_model.config.model_type == "bloom": - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_attention_heads, d_k, 1] - else: - new_shape = [input_bs * num_attention_heads, 1, d_k] - pkv = pkv + (torch.ones(size=new_shape),) - else: - new_shape = [input_bs, num_attention_heads, 1, d_k] - dummy_tensor = torch.ones(size=new_shape) - pkv = tuple(dummy_tensor for _ in range(nb_pkv)) - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) - return past_key_values - - class Evaluator: - def __init__( - self, - dataset, - tokenizer, - batch_size=8, - pad_val=1, - pad_max=512, - is_calib=False, - ): - self.dataset = dataset - self.tokenizer = tokenizer - self.batch_size = batch_size - self.pad_val = pad_val - self.pad_max = pad_max - self.is_calib = is_calib - - # tokenize the dataset - self.dataset = self.dataset.map(self.tokenize_function, batched=True) - self.dataset.set_format(type="torch", columns=["input_ids"]) - - @torch.no_grad() - def tokenize_function(self, examples): - example = self.tokenizer(examples["text"]) - return example - - @torch.no_grad() - def collate_batch(self, batch): - input_ids_padded = [] - last_ind = [] - for text in batch: - input_ids = text["input_ids"] - pad_len = self.pad_max - input_ids.shape[0] - last_ind.append(input_ids.shape[0] - 1) - if self.is_calib: 
- input_ids = ( - input_ids[: self.pad_max] - if len(input_ids) > self.pad_max - else input_ids - ) - else: - input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) - input_ids_padded.append(input_ids) - return ( - torch.vstack(input_ids_padded), - torch.tensor(last_ind), - ) - - calib_dataset = load_dataset(args.dataset, split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - calib_evaluator = Evaluator( - calib_dataset, - tokenizer, - args.batch_size, - pad_max=args.pad_max_length, - is_calib=True, - ) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) - input_ids = user_model.dummy_inputs["input_ids"] - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, user_model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - example_inputs = (input_ids, tuple(past_key_values), attention_mask) - # do inference to check example_inputs formats - user_model(*example_inputs) - - def calib_func(prepared_model): - for i, (input_ids, last_ind) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, user_model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= args.calib_iters: - break - prepared_model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - - from neural_compressor import PostTrainingQuantConfig, quantization - + from intel_extension_for_transformers.neural_chat.config import SmoothQuantConfig + from intel_extension_for_transformers.llm.quantization.optimization import Optimization if re.search("gptj", user_model.config.model_type) or re.search( "gpt_neox", user_model.config.model_type ): @@ -225,30 +115,18 @@ def calib_func(prepared_model): else: op_type_dict = {} excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if args.sq: - args.alpha = args.alpha if args.alpha == "auto" else float(args.alpha) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": args.alpha}} - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - else: - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - example_inputs=example_inputs, - ) + config = SmoothQuantConfig(alpha=float(args.alpha), + op_type_dict=op_type_dict, + excluded_precisions=excluded_precisions + ) # save config user_model.config.save_pretrained(args.output_dir) - q_model = quantization.fit( + optimization = Optimization(config) + q_model = optimization.optimize( user_model, - conf, - calib_func=calib_func, + tokenizer ) + # save model q_model.save(args.output_dir) # Generation diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 5db84390df2..52d5695b03d 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -15,21 +15,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from doctest import Example from typing import Union +from venv import logger +from intel_extension_for_transformers.transformers.utils.utility import LazyImport from intel_extension_for_transformers.neural_chat.config import ( AMPConfig, WeightOnlyQuantizationConfig, - BitsAndBytesConfig + BitsAndBytesConfig, + SmoothQuantConfig ) +import logging +logger = logging.getLogger(__name__) +torch = LazyImport("torch") class Optimization: def __init__( self, - optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig] + optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig, SmoothQuantConfig] ): self.optimization_config = optimization_config - def optimize(self, model): + def optimize(self, model, tokenizer=None, calib_func=None): + """ + Optimize the model with a given config. + """ optimized_model = model config = self.optimization_config if isinstance(config, WeightOnlyQuantizationConfig): @@ -55,4 +65,132 @@ def optimize(self, model): model, conf, ).model + elif isinstance(config, SmoothQuantConfig): + print("Applying SmoothQuant.") + if tokenizer is None: + logger.error("Please provide the tokenizer. \n" + + "from transformer import AutoTokenizer \n" + + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + + "Or provide calib_func directly." + ) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + return tokenizer(examples["text"]) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = self.generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= 100: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": config.alpha}} + example_inputs = self.get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=config.excluded_precisions, + op_type_dict=config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + optimized_model = quantization.fit( + model, + conf, + calib_func=calib_func, + ) return optimized_model + + def generate_dummy_past_key_values(self, input_bs, model): + """ + Generate the dummy past_key_values. + """ + from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( + model.config.model_type + )(model.config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + + if model.config.model_type == "bloom": + pkv = () + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_attention_heads, d_k, 1] + else: + new_shape = [input_bs * num_attention_heads, 1, d_k] + pkv = pkv + (torch.ones(size=new_shape),) + else: + new_shape = [input_bs, num_attention_heads, 1, d_k] + dummy_tensor = torch.ones(size=new_shape) + pkv = tuple(dummy_tensor for _ in range(nb_pkv)) + past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + return past_key_values + + def get_example_inputs_for_trace(self, model, return_type="tuple"): + """ + Generate the example_input for tracing, support models load from AutoModelForCausalLM. 
+ + """ + input_ids = model.dummy_inputs["input_ids"] + input_bs, input_len = input_ids.shape + past_key_values = self.generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + example_inputs = (input_ids, tuple(past_key_values), attention_mask) + # do inference to check example_inputs formats + model(*example_inputs) + if return_type != "tuple": + example_inputs = { + "input_ids": input_ids, + "past_key_values": tuple(past_key_values), + "attention_mask": attention_mask + } + return example_inputs + diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 7248dc0bcb7..e8654d22978 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -417,6 +417,14 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: dtype: str = 'bfloat16' + op_type_dict = None + + +@dataclass +class SmoothQuantConfig: + alpha: float = 0.5 + op_type_dict: dict = None + excluded_precisions: dict = None class PipelineConfig: def __init__(self, From fccc16ade37219a55a062b5e33c07e2e50f39b74 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 14 Sep 2023 04:28:49 -0700 Subject: [PATCH 02/19] fix wrong typing and hide import Signed-off-by: changwangss --- .../text-generation/quantization/run_generation.py | 7 +------ .../llm/quantization/optimization.py | 10 ++++++---- intel_extension_for_transformers/neural_chat/config.py | 4 +--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 6d7ed135f9d..2a28ae127f3 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -17,8 +17,6 @@ import numpy as np from itertools import chain from optimum.utils import NormalizedConfigManager -# ipex dependency -import intel_extension_for_pytorch as ipex from optimum.intel.generation.modeling import TSModelForCausalLM @@ -37,13 +35,10 @@ ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") parser.add_argument("--quantize", action="store_true") -parser.add_argument("--ipex", action="store_true") -parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") parser.add_argument( "--pad_max_length", default=512, type=int, help="Pad input ids to max length." ) -parser.add_argument("--calib_iters", default=512, type=int, help="calibration iters.") parser.add_argument("--int8", action="store_true") parser.add_argument( "--int8_bf16_mixed", @@ -70,7 +65,7 @@ config = AutoConfig.from_pretrained( args.model, torchscript=True - if args.ipex + if args.quantize else False, # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
trust_remote_code=args.trust_remote_code, diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 52d5695b03d..e0b400c3499 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -67,12 +67,14 @@ def optimize(self, model, tokenizer=None, calib_func=None): ).model elif isinstance(config, SmoothQuantConfig): print("Applying SmoothQuant.") + import intel_extension_for_pytorch if tokenizer is None: - logger.error("Please provide the tokenizer. \n" + - "from transformer import AutoTokenizer \n" + - "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + - "Or provide calib_func directly." + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" ) + exit(0) if calib_func is None: from datasets import load_dataset from torch.utils.data import DataLoader diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index e8654d22978..13e0c3494db 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -416,9 +416,7 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: - dtype: str = 'bfloat16' - op_type_dict = None - + dtype: str = 'bfloat16' @dataclass class SmoothQuantConfig: From 4e989a506be26ac6ff71b69a4b1948e314cb01dc Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Thu, 14 Sep 2023 19:36:09 +0800 Subject: [PATCH 03/19] improve import check --- .../llm/quantization/optimization.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index e0b400c3499..07796efa16c 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -26,6 +26,7 @@ SmoothQuantConfig ) import logging +import warnings logger = logging.getLogger(__name__) torch = LazyImport("torch") @@ -67,7 +68,12 @@ def optimize(self, model, tokenizer=None, calib_func=None): ).model elif isinstance(config, SmoothQuantConfig): print("Applying SmoothQuant.") - import intel_extension_for_pytorch + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) if tokenizer is None: logger.error("Please provide the tokenizer or provide calib_func directly," + " the following is how to get tokenizer. 
\n" + From 40d20eb94a223bd914bd1898dea1d0f8e53e688a Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 01:11:16 -0700 Subject: [PATCH 04/19] rebase main Signed-off-by: changwangss --- .../quantization/run_generation.py | 95 ++++--- .../transformers/__init__.py | 36 +-- .../transformers/modeling/modeling_causal.py | 250 ++++++++++++++++++ .../transformers/utils/__init__.py | 9 +- .../transformers/utils/utility.py | 51 ++++ 5 files changed, 377 insertions(+), 64 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_causal.py diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 2a28ae127f3..b0529869455 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -12,7 +12,6 @@ from torch.utils.data import DataLoader from transformers import AutoConfig, AutoTokenizer, PretrainedConfig from transformers.utils import check_min_version -from intel_extension_for_transformers.transformers import AutoModelForCausalLM import transformers import numpy as np from itertools import chain @@ -63,46 +62,44 @@ # model config = AutoConfig.from_pretrained( - args.model, - torchscript=True - if args.quantize - else False, # torchscript will force `return_dict=False` to avoid jit errors - use_cache=True, # to use kv cache. - trust_remote_code=args.trust_remote_code, - revision=args.revision - ) + args.model, + torchscript=True + if args.quantize + else False, # torchscript will force `return_dict=False` to avoid jit errors + use_cache=True, # to use kv cache. + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + # transformers version >= 4.32.0 contained the mpt modeling definition. 
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py if config.model_type == "mpt": check_min_version("4.32.0") -user_model = AutoModelForCausalLM.from_pretrained( - args.model, - config=config -) - # tokenizer if config.model_type == "llama": - from transformers import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained(args.model) + from transformers import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained(args.model) else: - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) - -# to channels last -user_model = user_model.to(memory_format=torch.channels_last) -user_model.eval() + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) # quantize if args.quantize: - from intel_extension_for_transformers.neural_chat.config import SmoothQuantConfig - from intel_extension_for_transformers.llm.quantization.optimization import Optimization - if re.search("gptj", user_model.config.model_type) or re.search( - "gpt_neox", user_model.config.model_type + from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig, + BitsAndBytesConfig + + ) + from intel_extension_for_transformers.transformers import AutoModelForCausalLM + if re.search("gptj", config.model_type) or re.search( + "gpt_neox", config.model_type ): op_type_dict = { "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, } - elif re.search("mpt", user_model.config.model_type): + elif re.search("mpt", config.model_type): op_type_dict = { "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, "":{"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, @@ -110,19 +107,41 @@ else: op_type_dict = {} excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - config = SmoothQuantConfig(alpha=float(args.alpha), - op_type_dict=op_type_dict, - excluded_precisions=excluded_precisions + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + alpha=float(args.alpha), # default is 0.5 + op_type_dict=op_type_dict, # default is {} + excluded_precisions=excluded_precisions, # default is [] ) - # save config - user_model.config.save_pretrained(args.output_dir) - optimization = Optimization(config) - q_model = optimization.optimize( - user_model, - tokenizer - ) - # save model - q_model.save(args.output_dir) + # smooth-quant + q_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=sq_config + ) + print("sq done.") + # weight-only + woq_config = WeightOnlyQuantizationConfig(algorithm="RTN", # default is "RTN" + bits=8, # default is 8 + group_size=-1, # default is -1 + scheme="sym", # default is sym + enable_full_range=True # default is True + ) + woq_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=woq_config + ) + print("woq done.") + # amp + amp_config = AMPConfig(dtype="bfloat16") # default is bfloat16 + amp_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=amp_config + ) + print("amp done.") + # bitsandbytes + bab_config = BitsAndBytesConfig() + bab_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=bab_config + ) + print("bitsandbytes done.") + # Generation generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) diff --git a/intel_extension_for_transformers/transformers/__init__.py 
b/intel_extension_for_transformers/transformers/__init__.py index d78c4288971..fbec8b11cb7 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -16,33 +16,19 @@ # limitations under the License. -from .config import ( - AutoDistillationConfig, - DistillationConfig, - FlashDistillationConfig, - TFDistillationConfig, - NASConfig, - Provider, - PruningConfig, - QuantizationConfig, - WEIGHTS_NAME, - DynamicLengthConfig, - BenchmarkConfig, - PrunerV2, - -) -from .distillation import ( - DistillationCriterionMode, - SUPPORTED_DISTILLATION_CRITERION_MODE, -) -from .modeling import OptimizedModel, AutoModelForCausalLM +from .config import (WEIGHTS_NAME, AutoDistillationConfig, BenchmarkConfig, + DistillationConfig, DynamicLengthConfig, + FlashDistillationConfig, NASConfig, Provider, PrunerV2, + PruningConfig, QuantizationConfig, TFDistillationConfig) +from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, + DistillationCriterionMode) from .mixture.auto_distillation import AutoDistillation +from .modeling import AutoModelForCausalLM, OptimizedModel from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization -from .pruning import PrunerConfig, PruningMode, SUPPORTED_PRUNING_MODE -from .quantization import QuantizationMode, SUPPORTED_QUANT_MODE -from .utils import metrics -from .utils import objectives +from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode +from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode +from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, + WeightOnlyQuantizationConfig, metrics, objectives) from .utils.utility import LazyImport - diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py new file mode 100644 index 00000000000..e090d092ad2 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py @@ -0,0 +1,250 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy + +from transformers import AutoConfig, PretrainedConfig +from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING +from transformers.models.auto.auto_factory import _get_model_class +from intel_extension_for_transformers.transformers.utils.utility import ( + LazyImport, + generate_dummy_past_key_values, + get_example_inputs_for_trace +) + + +from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig +) +import logging +import warnings +logger = logging.getLogger(__name__) +torch = LazyImport("torch") + + +class _BaseAutoModelClass: + # Base class for auto models. + _model_mapping = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + import intel_extension_for_transformers.transformers.modeling.modeling_map + config = kwargs.pop("config", None) + calib_func = kwargs.pop("calib_func", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "code_revision", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + quantization_config = kwargs_orig.get("quantization_config", None) + if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or + isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, WeightOnlyQuantizationConfig) + ): + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + if isinstance(quantization_config, AMPConfig): + config.torch_dtype=torch.bfloat16 + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs + ) + _ = hub_kwargs.pop("code_revision", None) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + model = model_class.from_pretrained( + 
pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + else: + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + model.eval() + if isinstance(quantization_config, WeightOnlyQuantizationConfig): + logger.info("Applying Weight Only Quantization.") + from neural_compressor import PostTrainingQuantConfig, quantization + op_type_dict = { + '.*':{ # re.match + "weight": { + 'bits': quantization_config.bits, # 1-8 bits + 'group_size': quantization_config.group_size, # -1 (per-channel) + 'scheme': quantization_config.scheme, # sym/asym + 'algorithm': quantization_config.algorithm, # RTN/AWQ/TEQ + }, + }, + } + recipes = {"rtn_args": {"enable_full_range": quantization_config.enable_full_range}} + conf = PostTrainingQuantConfig( + approach='weight_only', + op_type_dict=op_type_dict, + recipes=recipes, + ) + model.config.torchscript = True + model = quantization.fit( + model, + conf, + ).model + elif isinstance(quantization_config, SmoothQuantConfig): + logger.info("Applying SmoothQuant.") + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) + if quantization_config.tokenizer is None: + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + return quantization_config.tokenizer(examples["text"]) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= 100: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} + example_inputs = get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + model.config.torchscript = True + model = quantization.fit( + model, + conf, + calib_func=calib_func + ).model + return model + + +class AutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index fa9f139b97d..1b574a0d3bf 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -15,4 +15,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Utils for optimization.""" \ No newline at end of file +"""Utils for optimization.""" + +from .quantization_config import ( + AMPConfig, + BitsAndBytesConfig, + SmoothQuantConfig, + WeightOnlyQuantizationConfig, +) diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index b7572fe69c4..82030c4a751 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -30,6 +30,7 @@ DECODER_WITH_PAST_NAME = "decoder_with_past_model.bin" WEIGHTS_NAME = "pytorch_model.bin" +torch = LazyImport("torch") def distributed_init(backend="gloo", world_size=1, rank=-1, init_method=None, master_addr='127.0.0.1', master_port='12345'): @@ -72,3 +73,53 @@ def __init__(self) -> None: self.batch_size = dataloader.total_batch_size self.dataset = dataloader.dataset return INCDataLoader() + +def generate_dummy_past_key_values(input_bs, model): + """ + Generate the dummy past_key_values. 
+ """ + from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( + model.config.model_type + )(model.config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + + if model.config.model_type == "bloom": + pkv = () + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_attention_heads, d_k, 1] + else: + new_shape = [input_bs * num_attention_heads, 1, d_k] + pkv = pkv + (torch.ones(size=new_shape),) + else: + new_shape = [input_bs, num_attention_heads, 1, d_k] + dummy_tensor = torch.ones(size=new_shape) + pkv = tuple(dummy_tensor for _ in range(nb_pkv)) + past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + return past_key_values + +def get_example_inputs_for_trace(model, return_type="tuple"): + """ + Generate the example_input for tracing, support models load from AutoModelForCausalLM. + + """ + input_ids = model.dummy_inputs["input_ids"] + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + example_inputs = (input_ids, tuple(past_key_values), attention_mask) + # do inference to check example_inputs formats + model(*example_inputs) + if return_type != "tuple": + example_inputs = { + "input_ids": input_ids, + "past_key_values": tuple(past_key_values), + "attention_mask": attention_mask + } + return example_inputs \ No newline at end of file From 2dab42133be3569d745802eaeef45dafd9d10b8e Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 01:37:42 -0700 Subject: [PATCH 05/19] remove the outdated code Signed-off-by: changwangss --- .../llm/quantization/optimization.py | 152 +----------------- .../neural_chat/config.py | 8 +- .../transformers/utils/quantization_config.py | 43 +++++ 3 files changed, 47 insertions(+), 156 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/utils/quantization_config.py diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 07796efa16c..5db84390df2 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -15,32 +15,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from doctest import Example from typing import Union -from venv import logger -from intel_extension_for_transformers.transformers.utils.utility import LazyImport from intel_extension_for_transformers.neural_chat.config import ( AMPConfig, WeightOnlyQuantizationConfig, - BitsAndBytesConfig, - SmoothQuantConfig + BitsAndBytesConfig ) -import logging -import warnings -logger = logging.getLogger(__name__) -torch = LazyImport("torch") class Optimization: def __init__( self, - optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig, SmoothQuantConfig] + optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig] ): self.optimization_config = optimization_config - def optimize(self, model, tokenizer=None, calib_func=None): - """ - Optimize the model with a given config. 
- """ + def optimize(self, model): optimized_model = model config = self.optimization_config if isinstance(config, WeightOnlyQuantizationConfig): @@ -66,139 +55,4 @@ def optimize(self, model, tokenizer=None, calib_func=None): model, conf, ).model - elif isinstance(config, SmoothQuantConfig): - print("Applying SmoothQuant.") - try: - import intel_extension_for_pytorch as ipex - except ImportError: - warnings.warn( - "Please install Intel Extension for PyTorch to accelerate the model inference." - ) - if tokenizer is None: - logger.error("Please provide the tokenizer or provide calib_func directly," + - " the following is how to get tokenizer. \n" + - " from transformer import AutoTokenizer \n" + - " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - if calib_func is None: - from datasets import load_dataset - from torch.utils.data import DataLoader - calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - return tokenizer(examples["text"]) - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - - def collate_batch(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - input_ids = ( - input_ids[: 512] - if len(input_ids) > 512 - else input_ids - ) - input_ids_padded.append(input_ids) - return (torch.vstack(input_ids_padded)) - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - def default_calib_func(model): - """ - This is the default calibration function, the dataset is NeelNanda/pile-10k, - the default calib_iters is 100. - """ - - for i, (input_ids) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = self.generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= 100: - break - model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": config.alpha}} - example_inputs = self.get_example_inputs_for_trace(model) - from neural_compressor import PostTrainingQuantConfig, quantization - conf = PostTrainingQuantConfig( - backend="ipex", - excluded_precisions=config.excluded_precisions, - op_type_dict=config.op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - if calib_func is None: - logger.info("The default calibration funcation is used, " + - "the calibration dataset is NeelNanda/pile-10k," + - "batchsize is 1 and calibration iteration is 100.") - calib_func = default_calib_func - else: - calib_func = calib_func - optimized_model = quantization.fit( - model, - conf, - calib_func=calib_func, - ) return optimized_model - - def generate_dummy_past_key_values(self, input_bs, model): - """ - Generate the dummy past_key_values. 
- """ - from optimum.utils import NormalizedConfigManager - normalized_config = NormalizedConfigManager.get_normalized_config_class( - model.config.model_type - )(model.config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - - if model.config.model_type == "bloom": - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_attention_heads, d_k, 1] - else: - new_shape = [input_bs * num_attention_heads, 1, d_k] - pkv = pkv + (torch.ones(size=new_shape),) - else: - new_shape = [input_bs, num_attention_heads, 1, d_k] - dummy_tensor = torch.ones(size=new_shape) - pkv = tuple(dummy_tensor for _ in range(nb_pkv)) - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) - return past_key_values - - def get_example_inputs_for_trace(self, model, return_type="tuple"): - """ - Generate the example_input for tracing, support models load from AutoModelForCausalLM. - - """ - input_ids = model.dummy_inputs["input_ids"] - input_bs, input_len = input_ids.shape - past_key_values = self.generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - example_inputs = (input_ids, tuple(past_key_values), attention_mask) - # do inference to check example_inputs formats - model(*example_inputs) - if return_type != "tuple": - example_inputs = { - "input_ids": input_ids, - "past_key_values": tuple(past_key_values), - "attention_mask": attention_mask - } - return example_inputs - diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 13e0c3494db..7248dc0bcb7 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -416,13 +416,7 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: - dtype: str = 'bfloat16' - -@dataclass -class SmoothQuantConfig: - alpha: float = 0.5 - op_type_dict: dict = None - excluded_precisions: dict = None + dtype: str = 'bfloat16' class PipelineConfig: def __init__(self, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py new file mode 100644 index 00000000000..512322825dc --- /dev/null +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Configs for intel extension for transformers.""" + +from dataclasses import dataclass +from typing import Optional, Any +from transformers import BitsAndBytesConfig + + +@dataclass +class WeightOnlyQuantizationConfig: + algorithm: str = 'RTN' + bits: int = 8 + group_size: int = -1 + scheme: str = 'sym' + enable_full_range: bool = True + +@dataclass +class AMPConfig: + dtype: str = 'bfloat16' + +@dataclass +class SmoothQuantConfig: + tokenizer: Any = None + calib_func: Any = None + alpha: float = 0.5 + op_type_dict: dict = None + excluded_precisions: dict = None + From 7ba2aede6a5b3688231dc6268a14195e6d91cb37 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Fri, 15 Sep 2023 16:46:26 +0800 Subject: [PATCH 06/19] update order --- intel_extension_for_transformers/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index fbec8b11cb7..8cf610f1015 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -23,7 +23,6 @@ from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, DistillationCriterionMode) from .mixture.auto_distillation import AutoDistillation -from .modeling import AutoModelForCausalLM, OptimizedModel from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization @@ -32,3 +31,4 @@ from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantizationConfig, metrics, objectives) from .utils.utility import LazyImport +from .modeling import AutoModelForCausalLM, OptimizedModel From 205f8eccf248b57e1d6dfd63e371f7b0c05bc459 Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 04:04:44 -0700 Subject: [PATCH 07/19] improve sqconfig and add ut Signed-off-by: changwangss --- .../transformers/modeling/modeling_causal.py | 18 ++++++-- .../transformers/utils/quantization_config.py | 19 ++++---- tests/test_quantization.py | 46 +++++++++++++++++++ 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py index e090d092ad2..92686191a28 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py @@ -176,11 +176,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if calib_func is None: from datasets import load_dataset from torch.utils.data import DataLoader - calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = quantization_config.calib_dataset + calib_iters = quantization_config.calib_iters + calib_dataset = load_dataset(calib_dataset, split="train") calib_dataset = calib_dataset.shuffle(seed=42) def tokenize_function(examples): - return quantization_config.tokenizer(examples["text"]) + if 'prompt' in examples: + example = quantization_config.tokenizer(examples["prompt"]) + elif 'text' in examples: + example = quantization_config.tokenizer(examples["text"]) + elif 'code' in examples: + example = quantization_config.tokenizer(examples["code"]) + else: + logger.error("Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset.") + exit(0) + return example tokenized_dataset = 
calib_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) @@ -213,7 +225,7 @@ def default_calib_func(model): past_key_values = generate_dummy_past_key_values(input_bs, model) attention_mask = torch.ones(input_bs, input_len + 1) attention_mask[:,0] = 0 - if i >= 100: + if i >= calib_iters: break model( input_ids=input_ids, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 512322825dc..1c590609055 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -16,28 +16,31 @@ # limitations under the License. """Configs for intel extension for transformers.""" -from dataclasses import dataclass -from typing import Optional, Any +from dataclasses import dataclass, field +from typing import Any, Optional + from transformers import BitsAndBytesConfig @dataclass class WeightOnlyQuantizationConfig: - algorithm: str = 'RTN' + algorithm: str = "RTN" bits: int = 8 group_size: int = -1 - scheme: str = 'sym' + scheme: str = "sym" enable_full_range: bool = True + @dataclass class AMPConfig: - dtype: str = 'bfloat16' + dtype: str = "bfloat16" @dataclass class SmoothQuantConfig: tokenizer: Any = None calib_func: Any = None + calib_dataset: str = "NeelNanda/pile-10k" + calib_iters: int = 100 alpha: float = 0.5 - op_type_dict: dict = None - excluded_precisions: dict = None - + op_type_dict: dict = None + excluded_precisions: list = field(default_factory=list) diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 08aa2e504bc..49031214936 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -286,6 +286,52 @@ def test_bf16_onnx(self): self.assertEqual(tensor.data_type, TensorProto.BFLOAT16) break + def test_quantization_for_llm(self): + model_name_or_path = "facebook/opt-125m" + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig, + BitsAndBytesConfig + + ) + from intel_extension_for_transformers.transformers import AutoModelForCausalLM + fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) + dummy_input = fp32_model.dummy_inputs["input_ids"] + + # smooth-quant + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + calib_iters=5 + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + # weight-only + woq_config = WeightOnlyQuantizationConfig() + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config + ) + output = woq_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.139640808105469) + # amp + amp_config = AMPConfig() + amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=amp_config + ) + output = amp_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.347761154174805) + + + # bitsandbytes + bab_config = BitsAndBytesConfig() + bab_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=bab_config + ) + output = bab_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.347761154174805) if __name__ == "__main__": 
unittest.main() From 3b234777ed83c59924870a267fcbfc313cc23dbe Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:11:28 -0700 Subject: [PATCH 08/19] refine woq Signed-off-by: changwangss --- .../llm/quantization/utils.py | 2 +- .../transformers/__init__.py | 41 ++- .../transformers/modeling/__init__.py | 3 +- .../transformers/modeling/modeling_auto.py | 327 ++++++++++++++---- .../transformers/modeling/modeling_causal.py | 262 -------------- .../transformers/utils/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 203 ++++++++++- tests/test_quantization.py | 20 +- 8 files changed, 492 insertions(+), 368 deletions(-) delete mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_causal.py diff --git a/intel_extension_for_transformers/llm/quantization/utils.py b/intel_extension_for_transformers/llm/quantization/utils.py index 31f86efef90..f20105c8a55 100644 --- a/intel_extension_for_transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/llm/quantization/utils.py @@ -184,7 +184,7 @@ def convert_to_quantized_model(model, config): "dtype":dtype, "group_size": config.group_size, # -1 (per-channel) "scheme": config.scheme, - "algorithm": "RTN", + "algorithm": config.algorithm, }, }, }, diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 8cf610f1015..18ce4ef6e87 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -16,19 +16,42 @@ # limitations under the License. -from .config import (WEIGHTS_NAME, AutoDistillationConfig, BenchmarkConfig, - DistillationConfig, DynamicLengthConfig, - FlashDistillationConfig, NASConfig, Provider, PrunerV2, - PruningConfig, QuantizationConfig, TFDistillationConfig) -from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, - DistillationCriterionMode) +from .config import ( + WEIGHTS_NAME, + AutoDistillationConfig, + BenchmarkConfig, + DistillationConfig, + DynamicLengthConfig, + FlashDistillationConfig, + NASConfig, + Provider, + PrunerV2, + PruningConfig, + QuantizationConfig, + TFDistillationConfig, +) +from .distillation import ( + SUPPORTED_DISTILLATION_CRITERION_MODE, + DistillationCriterionMode, +) from .mixture.auto_distillation import AutoDistillation from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode -from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, - WeightOnlyQuantizationConfig, metrics, objectives) +from .utils import ( + AMPConfig, + BitsAndBytesConfig, + SmoothQuantConfig, + WeightOnlyQuantConfig, + metrics, + objectives, +) from .utils.utility import LazyImport -from .modeling import AutoModelForCausalLM, OptimizedModel +from .modeling import ( + AutoModelForCausalLM, + AutoModel, + AutoModelForSeq2SeqLM, + OptimizedModel, +) diff --git a/intel_extension_for_transformers/transformers/modeling/__init__.py b/intel_extension_for_transformers/transformers/modeling/__init__.py index e8353a5ea25..5fb99065b03 100644 --- a/intel_extension_for_transformers/transformers/modeling/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/__init__.py @@ -19,4 +19,5 @@ from .model import OptimizedModel -from .modeling_auto import AutoModelForCausalLM +from .modeling_auto 
import (AutoModel, AutoModelForCausalLM, + AutoModelForSeq2SeqLM) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 92e23893449..517c91371fc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -1,75 +1,252 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import torch -import transformers - - -logger = logging.getLogger(__name__) - - -class _BaseQBitsAutoModelClass: - ORIG_MODEL = None - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - import intel_extension_for_transformers.transformers.modeling.modeling_map - load_in_8bit = kwargs.pop("load_in_8bit", False) - load_in_4bit = kwargs.pop("load_in_4bit", False) - quantization_config = kwargs.pop("quantization_config", None) - if load_in_8bit or load_in_4bit or quantization_config is not None: - from ...llm.quantization.config import WeightOnlyConfig - from ...llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str - torch_dtype = kwargs.pop("torch_dtype", torch.float32) - if load_in_4bit: - if quantization_config is None: - quantization_config = WeightOnlyConfig(compute_dtype=torch_dtype, weight_dtype="nf4") - else: - assert "4" in quantization_config.weight_dtype and quantization_config.compute_dtype == torch_dtype, \ - f"Quantization_config.weight_dtype should be 'nf4', 'int4_fullrange', 'int4_clip'," - f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}." - elif load_in_8bit: - if quantization_config is None: - quantization_config = WeightOnlyConfig(compute_dtype=torch_dtype, weight_dtype="int8") - else: - assert quantization_config.weight_dtype == "int8" \ - and quantization_config.compute_dtype == torch_dtype, \ - f"Quantization_config.weight_dtype should be 'int8' and compute_dtype should be {torch_dtype}." 
- elif quantization_config is not None: - if quantization_config.compute_dtype != convert_dtype_2_str(torch_dtype): - logger.warning(f"Quantization_config.compute_dtype should be align with {torch_dtype}.") - - model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - if quantization_config is not None: - return convert_to_quantized_model(model, quantization_config) - else: - return model - - -class AutoModelForCausalLM(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModelForCausalLM - - -class AutoModel(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModel - - -class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModelForSeq2SeqLM - +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from transformers import AutoConfig, PretrainedConfig +from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from transformers.models.auto.modeling_auto import (MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_MAPPING + ) + +from transformers.models.auto.auto_factory import _get_model_class +from intel_extension_for_transformers.transformers.utils.utility import ( + LazyImport, + generate_dummy_past_key_values, + get_example_inputs_for_trace +) + + +from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantConfig, + SmoothQuantConfig +) +import logging +import warnings +logger = logging.getLogger(__name__) +torch = LazyImport("torch") + + +class _BaseAutoModelClass: + # Base class for auto models. 
+ _model_mapping = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + import intel_extension_for_transformers.transformers.modeling.modeling_map + config = kwargs.pop("config", None) + calib_func = kwargs.pop("calib_func", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "code_revision", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + quantization_config = kwargs_orig.get("quantization_config", None) + if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or + isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, WeightOnlyQuantConfig) + ): + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + if isinstance(quantization_config, AMPConfig): + config.torch_dtype=torch.bfloat16 + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs + ) + _ = hub_kwargs.pop("code_revision", None) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + else: + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
+ ) + model.eval() + if isinstance(quantization_config, WeightOnlyQuantConfig): + logger.info("Applying Weight Only Quantization.") + from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model + convert_to_quantized_model(model, quantization_config) + elif isinstance(quantization_config, SmoothQuantConfig): + logger.info("Applying SmoothQuant.") + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) + if quantization_config.tokenizer is None: + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = quantization_config.calib_dataset + calib_iters = quantization_config.calib_iters + calib_dataset = load_dataset(calib_dataset, split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + if 'prompt' in examples: + example = quantization_config.tokenizer(examples["prompt"]) + elif 'text' in examples: + example = quantization_config.tokenizer(examples["text"]) + elif 'code' in examples: + example = quantization_config.tokenizer(examples["code"]) + else: + logger.error("Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset.") + exit(0) + return example + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= calib_iters: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} + example_inputs = get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + model.config.torchscript = True + model = quantization.fit( + model, + conf, + calib_func=calib_func + ).model + return model + + +class AutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING + +class AutoModel(_BaseAutoModelClass): + _model_mapping = MODEL_MAPPING + +class AutoModelForSeq2SeqLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py deleted file mode 100644 index 92686191a28..00000000000 --- a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py +++ /dev/null @@ -1,262 +0,0 @@ -# !/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# coding=utf-8 -# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -from transformers import AutoConfig, PretrainedConfig -from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from transformers.models.auto.auto_factory import _get_model_class -from intel_extension_for_transformers.transformers.utils.utility import ( - LazyImport, - generate_dummy_past_key_values, - get_example_inputs_for_trace -) - - -from intel_extension_for_transformers.transformers import ( - AMPConfig, - WeightOnlyQuantizationConfig, - SmoothQuantConfig -) -import logging -import warnings -logger = logging.getLogger(__name__) -torch = LazyImport("torch") - - -class _BaseAutoModelClass: - # Base class for auto models. - _model_mapping = None - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - import intel_extension_for_transformers.transformers.modeling.modeling_map - config = kwargs.pop("config", None) - calib_func = kwargs.pop("calib_func", None) - trust_remote_code = kwargs.pop("trust_remote_code", None) - kwargs["_from_auto"] = True - hub_kwargs_names = [ - "cache_dir", - "code_revision", - "force_download", - "local_files_only", - "proxies", - "resume_download", - "revision", - "subfolder", - "use_auth_token", - ] - hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} - - if not isinstance(config, PretrainedConfig): - kwargs_orig = copy.deepcopy(kwargs) - # ensure not to pollute the config object with torch_dtype="auto" - since it's - # meaningless in the context of the config object - torch.dtype values are acceptable - if kwargs.get("torch_dtype", None) == "auto": - _ = kwargs.pop("torch_dtype") - # to not overwrite the quantization_config if config has a quantization_config - - if kwargs.get("quantization_config", None) is not None: - _ = kwargs.pop("quantization_config") - - config, kwargs = AutoConfig.from_pretrained( - pretrained_model_name_or_path, - return_unused_kwargs=True, - trust_remote_code=trust_remote_code, - **hub_kwargs, - **kwargs, - ) - - # if torch_dtype=auto was passed here, ensure to pass it on - if kwargs_orig.get("torch_dtype", None) == "auto": - kwargs["torch_dtype"] = "auto" - quantization_config = kwargs_orig.get("quantization_config", None) - if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or - isinstance(quantization_config, AMPConfig) or - isinstance(quantization_config, WeightOnlyQuantizationConfig) - ): - kwargs["quantization_config"] = kwargs_orig["quantization_config"] - if isinstance(quantization_config, AMPConfig): - config.torch_dtype=torch.bfloat16 - - has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map - has_local_code = type(config) in cls._model_mapping.keys() - trust_remote_code = resolve_trust_remote_code( - trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code - ) - if has_remote_code and trust_remote_code: - class_ref = config.auto_map[cls.__name__] - model_class = get_class_from_dynamic_module( - class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs - ) - _ = hub_kwargs.pop("code_revision", None) - model = model_class.from_pretrained( - pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs - ) - elif type(config) in cls._model_mapping.keys(): - model_class = _get_model_class(config, cls._model_mapping) - model = model_class.from_pretrained( - 
pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs - ) - else: - raise ValueError( - f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" - f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." - ) - model.eval() - if isinstance(quantization_config, WeightOnlyQuantizationConfig): - logger.info("Applying Weight Only Quantization.") - from neural_compressor import PostTrainingQuantConfig, quantization - op_type_dict = { - '.*':{ # re.match - "weight": { - 'bits': quantization_config.bits, # 1-8 bits - 'group_size': quantization_config.group_size, # -1 (per-channel) - 'scheme': quantization_config.scheme, # sym/asym - 'algorithm': quantization_config.algorithm, # RTN/AWQ/TEQ - }, - }, - } - recipes = {"rtn_args": {"enable_full_range": quantization_config.enable_full_range}} - conf = PostTrainingQuantConfig( - approach='weight_only', - op_type_dict=op_type_dict, - recipes=recipes, - ) - model.config.torchscript = True - model = quantization.fit( - model, - conf, - ).model - elif isinstance(quantization_config, SmoothQuantConfig): - logger.info("Applying SmoothQuant.") - try: - import intel_extension_for_pytorch as ipex - except ImportError: - warnings.warn( - "Please install Intel Extension for PyTorch to accelerate the model inference." - ) - if quantization_config.tokenizer is None: - logger.error("Please provide the tokenizer or provide calib_func directly," + - " the following is how to get tokenizer. \n" + - " from transformer import AutoTokenizer \n" + - " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - if calib_func is None: - from datasets import load_dataset - from torch.utils.data import DataLoader - calib_dataset = quantization_config.calib_dataset - calib_iters = quantization_config.calib_iters - calib_dataset = load_dataset(calib_dataset, split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - if 'prompt' in examples: - example = quantization_config.tokenizer(examples["prompt"]) - elif 'text' in examples: - example = quantization_config.tokenizer(examples["text"]) - elif 'code' in examples: - example = quantization_config.tokenizer(examples["code"]) - else: - logger.error("Please check dataset prompt identifier," + - " NeelNanda/pile-10k is default used calibration dataset.") - exit(0) - return example - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - - def collate_batch(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - input_ids = ( - input_ids[: 512] - if len(input_ids) > 512 - else input_ids - ) - input_ids_padded.append(input_ids) - return (torch.vstack(input_ids_padded)) - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - def default_calib_func(model): - """ - This is the default calibration function, the dataset is NeelNanda/pile-10k, - the default calib_iters is 100. 
- """ - - for i, (input_ids) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= calib_iters: - break - model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} - example_inputs = get_example_inputs_for_trace(model) - from neural_compressor import PostTrainingQuantConfig, quantization - conf = PostTrainingQuantConfig( - backend="ipex", - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - if calib_func is None: - logger.info("The default calibration funcation is used, " + - "the calibration dataset is NeelNanda/pile-10k," + - "batchsize is 1 and calibration iteration is 100.") - calib_func = default_calib_func - else: - calib_func = calib_func - model.config.torchscript = True - model = quantization.fit( - model, - conf, - calib_func=calib_func - ).model - return model - - -class AutoModelForCausalLM(_BaseAutoModelClass): - _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 1b574a0d3bf..c4eae8a076a 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -21,5 +21,5 @@ AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, - WeightOnlyQuantizationConfig, + WeightOnlyQuantConfig, ) diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 1c590609055..9653c1a0dbf 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -16,19 +16,204 @@ # limitations under the License. 
"""Configs for intel extension for transformers.""" +import copy +import json +import os from dataclasses import dataclass, field -from typing import Any, Optional - +from typing import Any, Optional, Dict, Union +from .utility import LazyImport from transformers import BitsAndBytesConfig +from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str +torch = LazyImport("torch") +class WeightOnlyQuantConfig: + def __init__( + self, + llm_int8_skip_modules=None, + compute_dtype=None, + weight_dtype="int4_fullrange", # int8 int4_clip, int4_fullrange fp4_e2m1_bnb fp4_e2m1 nf4 + scale_dtype="fp32", # Now only fp32 + mse_range=False, + use_double_quant=False, + double_quant_dtype="int8", # reserve for double quant + double_quant_scale_dtype="fp32", # reserve for double quant + group_size=None, + scheme="sym", + algorithm="RTN", + **kwargs, + ): + self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.mse_range = mse_range + self.use_double_quant = use_double_quant + self.double_quant_dtype = double_quant_dtype + self.double_quant_scale_dtype = double_quant_scale_dtype + self.scheme = scheme + self.algorithm = algorithm -@dataclass -class WeightOnlyQuantizationConfig: - algorithm: str = "RTN" - bits: int = 8 - group_size: int = -1 - scheme: str = "sym" - enable_full_range: bool = True + if group_size is None: + self.group_size = 32 + else: + self.group_size = group_size + if compute_dtype is None: + self.compute_dtype = "fp32" + elif isinstance(compute_dtype, str): + self.compute_dtype = compute_dtype + elif isinstance(compute_dtype, torch.dtype): + self.compute_dtype = convert_dtype_2_str(compute_dtype) + else: + raise ValueError("bit4_compute_dtype must be a string or a torch.dtype") + + self.post_init() + + def post_init(self): + r""" + Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. + """ + + if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): + raise ValueError("llm_int8_skip_modules must be a list of strings") + + if self.compute_dtype is not None and self.compute_dtype not in ['fp32', 'bf16', 'int8']: + raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") + + if self.weight_dtype not in ['int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1']: + raise ValueError(f"weight_dtype must be a string in " + f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1'") + + if self.scale_dtype not in ["fp32"]: + raise ValueError("scale_dtype must be a string in 'fp32'") + + if not isinstance(self.mse_range, bool): + raise ValueError("mse_range must be a boolean") + + if not isinstance(self.use_double_quant, bool): + raise ValueError("use_double_quant must be a boolean") + + if self.use_double_quant and not isinstance(self.double_quant_dtype, str): + raise ValueError("double_quant_dtype must be a string") + + if self.use_double_quant and not isinstance(self.double_quant_scale_dtype, str): + raise ValueError("double_quant_scale_dtype must be a string") + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if not isinstance(self.scheme, str): + raise ValueError("scheme must be a string") + + def quantization_method(self): + r""" + This method returns the quantization method used for the model. 
+ """ + if self.weight_dtype == 8: + return "s8" + elif self.weight_dtype == 4 and self.weight_dtype == "s4fullrange": + return "s4fullrange" + else: + raise ValueError("Only support int8 and int4 quantization now!") + + @classmethod + def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): + """ + Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. + return_unused_kwargs (`bool`): + Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in + `PreTrainedModel`. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. + """ + + config = cls(**config_dict) + + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return config, kwargs + else: + return config + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + config_dict = self.to_dict() + json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + writer.write(json_string) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + + output = copy.deepcopy(self.__dict__) + output["compute_dtype"] = str(output["compute_dtype"]).split(".")[1] + + return output + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_json_string(self, use_diff: bool = True) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = WeightOnlyConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict @dataclass diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 49031214936..140fbfc087a 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -291,7 +291,7 @@ def test_quantization_for_llm(self): tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) from intel_extension_for_transformers.transformers import ( AMPConfig, - WeightOnlyQuantizationConfig, + WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig @@ -301,16 +301,16 @@ def test_quantization_for_llm(self): dummy_input = fp32_model.dummy_inputs["input_ids"] # smooth-quant - sq_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - calib_iters=5 - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=sq_config - ) - self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + #sq_config = SmoothQuantConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or calib_func + # calib_iters=5 + # ) + #q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=sq_config + # ) + #self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) # weight-only - woq_config = WeightOnlyQuantizationConfig() + woq_config = WeightOnlyQuantConfig() woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config ) From 22204ee4b7d8c7e93e08a4b414be6f5419e1a68e Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:37:40 -0700 Subject: [PATCH 09/19] fix mp name Signed-off-by: changwangss --- .../transformers/modeling/modeling_auto.py | 6 +++--- .../transformers/utils/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 2 +- tests/test_quantization.py | 20 +++++++++---------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 517c91371fc..d5ad2705a7e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -48,7 +48,7 @@ from intel_extension_for_transformers.transformers import ( - AMPConfig, + MixedPrecisionConfig, WeightOnlyQuantConfig, SmoothQuantConfig ) @@ -106,11 +106,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): kwargs["torch_dtype"] = "auto" quantization_config = kwargs_orig.get("quantization_config", None) if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or - isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, MixedPrecisionConfig) or isinstance(quantization_config, WeightOnlyQuantConfig) ): kwargs["quantization_config"] = kwargs_orig["quantization_config"] - if isinstance(quantization_config, AMPConfig): + if isinstance(quantization_config, MixedPrecisionConfig): config.torch_dtype=torch.bfloat16 has_remote_code = hasattr(config, "auto_map") and 
cls.__name__ in config.auto_map diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index c4eae8a076a..725ec6924be 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -18,7 +18,7 @@ """Utils for optimization.""" from .quantization_config import ( - AMPConfig, + MixedPrecisionConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantConfig, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 9653c1a0dbf..87d3837cfa5 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -217,7 +217,7 @@ def to_diff_dict(self) -> Dict[str, Any]: @dataclass -class AMPConfig: +class MixedPrecisionConfig: dtype: str = "bfloat16" @dataclass diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 140fbfc087a..c9d027ea8d2 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -290,7 +290,7 @@ def test_quantization_for_llm(self): model_name_or_path = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) from intel_extension_for_transformers.transformers import ( - AMPConfig, + MixedPrecisionConfig, WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig @@ -301,14 +301,14 @@ def test_quantization_for_llm(self): dummy_input = fp32_model.dummy_inputs["input_ids"] # smooth-quant - #sq_config = SmoothQuantConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or calib_func - # calib_iters=5 - # ) - #q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=sq_config - # ) - #self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + calib_iters=5 + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) # weight-only woq_config = WeightOnlyQuantConfig() woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, @@ -317,7 +317,7 @@ def test_quantization_for_llm(self): output = woq_model(dummy_input) self.assertTrue(float(output[0][0][0][0]), -7.139640808105469) # amp - amp_config = AMPConfig() + amp_config = MixedPrecisionConfig() amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=amp_config ) From caa470442baf3c1e219154a4c6ca2831b313c6ce Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:58:35 -0700 Subject: [PATCH 10/19] fix pylint Signed-off-by: changwangss --- intel_extension_for_transformers/transformers/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 18ce4ef6e87..1a13a84ffec 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -41,7 +41,7 @@ from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode from .utils import ( - 
AMPConfig, + MixedPrecisionConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantConfig, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 87d3837cfa5..1d5edc3c658 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -117,7 +117,7 @@ def quantization_method(self): @classmethod def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): """ - Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. + Instantiates a [`WeightOnlyQuantConfig`] from a Python dictionary of parameters. Args: config_dict (`Dict[str, Any]`): @@ -129,7 +129,7 @@ def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): Additional parameters from which to initialize the configuration object. Returns: - [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. + [`WeightOnlyQuantConfig`]: The configuration object instantiated from those parameters. """ config = cls(**config_dict) @@ -181,7 +181,7 @@ def to_json_string(self, use_diff: bool = True) -> str: Args: use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` + If set to `True`, only the difference between the config instance and the default `WeightOnlyQuantConfig()` is serialized to JSON string. Returns: @@ -204,7 +204,7 @@ def to_diff_dict(self) -> Dict[str, Any]: config_dict = self.to_dict() # get the default config dict - default_config_dict = WeightOnlyConfig().to_dict() + default_config_dict = WeightOnlyQuantConfig().to_dict() serializable_config_dict = {} From 1c1834900bd070892ac3ab0b67fab05752f399a8 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 18 Sep 2023 00:18:19 -0700 Subject: [PATCH 11/19] fix import Signed-off-by: changwangss --- .../transformers/utils/quantization_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 1d5edc3c658..b5a528e5415 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -23,7 +23,6 @@ from typing import Any, Optional, Dict, Union from .utility import LazyImport from transformers import BitsAndBytesConfig -from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str torch = LazyImport("torch") class WeightOnlyQuantConfig: @@ -42,6 +41,7 @@ def __init__( algorithm="RTN", **kwargs, ): + from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] self.weight_dtype = weight_dtype self.scale_dtype = scale_dtype @@ -181,7 +181,8 @@ def to_json_string(self, use_diff: bool = True) -> str: Args: use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyQuantConfig()` + If set to `True`, only the difference between the config instance and the default + `WeightOnlyQuantConfig()` is serialized to JSON string. 
Returns: From 4ef0a790c5bb80e545af8c4639df02d84d36a488 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 18 Sep 2023 16:21:58 +0800 Subject: [PATCH 12/19] Fixed shape error for weight-only quantization op Signed-off-by: Cheng, Penghui --- .../llm/quantization/nn/modules.py | 9 +++++++-- tests/test_weight_only.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/llm/quantization/nn/modules.py index 3a3ddf0d283..278ab0bf144 100644 --- a/intel_extension_for_transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/llm/quantization/nn/modules.py @@ -18,6 +18,8 @@ import os import torch +from functools import reduce +from operator import mul from peft.tuners.lora import LoraLayer from ..autograd import matmul_4bit @@ -98,13 +100,16 @@ def forward(self, x: torch.Tensor): if getattr(self.weight, 'quant_state', None) is None: print('FP4 quantization state not initialized. Please call .quantize_weights().') - m = x.size()[0] + shape = list(x.size()) + m = reduce(mul, shape[0:-1]) out = torch.zeros(m, self.out_features, dtype=x.dtype) bias = None if self.bias is None else self.bias.data torch.ops.weight_only_jblasop.qbits_linear( - x, self.weight.data, bias, out, + x.view(m, shape[-1]), self.weight.data, bias, out, self.out_features, self.bias is not None, self.compute_dtype, self.weight_dtype ) + shape[-1] = self.out_features + out = out.view(shape) return out diff --git a/tests/test_weight_only.py b/tests/test_weight_only.py index 291a18672c5..dd9c1af54b0 100644 --- a/tests/test_weight_only.py +++ b/tests/test_weight_only.py @@ -51,7 +51,7 @@ def test_int4(self): model = M(with_bias=bias) with torch.no_grad(): model.linear.weight = torch.nn.Parameter(revert_wei) - activation = torch.rand(1, 32, dtype=torch.float) + activation = torch.rand(1, 5, 32, dtype=torch.float) output = model(activation) with torch.no_grad(): model.linear.weight = torch.nn.Parameter(raw_wei) From 34d2354e3dab2009369863c98a295164cc3016d2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 18 Sep 2023 16:56:00 +0800 Subject: [PATCH 13/19] Fixed UT error for weight-only quantization Signed-off-by: Cheng, Penghui --- .../llm/quantization/config/__init__.py | 3 - .../config/quantization_config.py | 212 ------------------ tests/test_weight_only.py | 10 +- 3 files changed, 5 insertions(+), 220 deletions(-) delete mode 100644 intel_extension_for_transformers/llm/quantization/config/quantization_config.py diff --git a/intel_extension_for_transformers/llm/quantization/config/__init__.py b/intel_extension_for_transformers/llm/quantization/config/__init__.py index 1df7ead0cec..18896e7b549 100644 --- a/intel_extension_for_transformers/llm/quantization/config/__init__.py +++ b/intel_extension_for_transformers/llm/quantization/config/__init__.py @@ -14,6 +14,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - -from .quantization_config import WeightOnlyConfig \ No newline at end of file diff --git a/intel_extension_for_transformers/llm/quantization/config/quantization_config.py b/intel_extension_for_transformers/llm/quantization/config/quantization_config.py deleted file mode 100644 index 6845774653d..00000000000 --- a/intel_extension_for_transformers/llm/quantization/config/quantization_config.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import json -import os -import torch -from ..utils import convert_dtype_2_str -from typing import Any, Dict, Union - - -class WeightOnlyConfig: - def __init__( - self, - llm_int8_skip_modules=None, - compute_dtype=None, - weight_dtype="int4_fullrange", # int8 int4_clip, int4_fullrange fp4_e2m1_bnb fp4_e2m1 nf4 - scale_dtype="fp32", # Now only fp32 - mse_range=False, - use_double_quant=False, - double_quant_dtype="int8", # reserve for double quant - double_quant_scale_dtype="fp32", # reserve for double quant - group_size=None, - scheme="sym", - **kwargs, - ): - self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] - self.weight_dtype = weight_dtype - self.scale_dtype = scale_dtype - self.mse_range = mse_range - self.use_double_quant = use_double_quant - self.double_quant_dtype = double_quant_dtype - self.double_quant_scale_dtype = double_quant_scale_dtype - self.scheme = scheme - - if group_size is None: - self.group_size = 32 - else: - self.group_size = group_size - if compute_dtype is None: - self.compute_dtype = "fp32" - elif isinstance(compute_dtype, str): - self.compute_dtype = compute_dtype - elif isinstance(compute_dtype, torch.dtype): - self.compute_dtype = convert_dtype_2_str(compute_dtype) - else: - raise ValueError("bit4_compute_dtype must be a string or a torch.dtype") - - self.post_init() - - def post_init(self): - r""" - Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
- """ - - if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") - - if self.compute_dtype is not None and self.compute_dtype not in ['fp32', 'bf16', 'int8']: - raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") - - if self.weight_dtype not in ['int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1']: - raise ValueError(f"weight_dtype must be a string in " - f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1'") - - if self.scale_dtype not in ["fp32"]: - raise ValueError("scale_dtype must be a string in 'fp32'") - - if not isinstance(self.mse_range, bool): - raise ValueError("mse_range must be a boolean") - - if not isinstance(self.use_double_quant, bool): - raise ValueError("use_double_quant must be a boolean") - - if self.use_double_quant and not isinstance(self.double_quant_dtype, str): - raise ValueError("double_quant_dtype must be a string") - - if self.use_double_quant and not isinstance(self.double_quant_scale_dtype, str): - raise ValueError("double_quant_scale_dtype must be a string") - - if not isinstance(self.group_size, int): - raise ValueError("group_size must be a int") - - if not isinstance(self.scheme, str): - raise ValueError("scheme must be a string") - - def quantization_method(self): - r""" - This method returns the quantization method used for the model. - """ - if self.weight_dtype == 8: - return "s8" - elif self.weight_dtype == 4 and self.weight_dtype == "s4fullrange": - return "s4fullrange" - else: - raise ValueError("Only support int8 and int4 quantization now!") - - @classmethod - def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): - """ - Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. - - Args: - config_dict (`Dict[str, Any]`): - Dictionary that will be used to instantiate the configuration object. - return_unused_kwargs (`bool`): - Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in - `PreTrainedModel`. - kwargs (`Dict[str, Any]`): - Additional parameters from which to initialize the configuration object. - - Returns: - [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. - """ - - config = cls(**config_dict) - - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - if return_unused_kwargs: - return config, kwargs - else: - return config - - def to_json_file(self, json_file_path: Union[str, os.PathLike]): - """ - Save this instance to a JSON file. - - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this configuration instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - config_dict = self.to_dict() - json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - writer.write(json_string) - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
- """ - - output = copy.deepcopy(self.__dict__) - output["compute_dtype"] = str(output["compute_dtype"]).split(".")[1] - - return output - - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - - def to_json_string(self, use_diff: bool = True) -> str: - """ - Serializes this instance to a JSON string. - - Args: - use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` - is serialized to JSON string. - - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict() - else: - config_dict = self.to_dict() - return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - def to_diff_dict(self) -> Dict[str, Any]: - """ - Removes all attributes from config which correspond to the default config attributes for better readability and - serializes to a Python dictionary. - - Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() - - # get the default config dict - default_config_dict = WeightOnlyConfig().to_dict() - - serializable_config_dict = {} - - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if value != default_config_dict[key]: - serializable_config_dict[key] = value - - return serializable_config_dict \ No newline at end of file diff --git a/tests/test_weight_only.py b/tests/test_weight_only.py index dd9c1af54b0..90d2ba54b19 100644 --- a/tests/test_weight_only.py +++ b/tests/test_weight_only.py @@ -4,7 +4,7 @@ from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM from intel_extension_for_transformers.llm.quantization.nn.modules import QuantizedLinearQBits from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, replace_linear -from intel_extension_for_transformers.llm.quantization.config import WeightOnlyConfig +from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig class M(torch.nn.Module): @@ -33,7 +33,7 @@ def test_int8(self): activation = torch.rand(1,32, dtype=torch.float) output = model(activation) - config = WeightOnlyConfig(weight_dtype="int8", group_size=32) + config = WeightOnlyQuantConfig(weight_dtype="int8", group_size=32) convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -56,7 +56,7 @@ def test_int4(self): with torch.no_grad(): model.linear.weight = torch.nn.Parameter(raw_wei) - config = WeightOnlyConfig(weight_dtype="int4_fullrange", group_size=32) + config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", group_size=32) convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -80,7 +80,7 @@ def test_int4(self): # return x # model = LinearPredictor() - # replace_linear(model, None, None, WeightOnlyConfig(weight_dtype='int4_fullrange')) + # replace_linear(model, None, None, WeightOnlyQuantConfig(weight_dtype='int4_fullrange')) # lossfn = torch.nn.MSELoss() # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=1e-3) # batch_size = 16 @@ -108,7 +108,7 @@ def test_auto_model(self): self.assertTrue(len(module_list) > 0) def test_auto_model_with_config(self): - config = WeightOnlyConfig() + config = WeightOnlyQuantConfig() model = AutoModelForCausalLM.from_pretrained(llama_model_path, 
quantization_config=config) module_list = [] for name, module in model.named_modules(): From 14df0f3533a8dfc2f315a8a8eae09ace9fb1e0f0 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 18 Sep 2023 04:01:48 -0700 Subject: [PATCH 14/19] improve the example Signed-off-by: changwangss --- .../text-generation/quantization/README.md | 155 +++++++++--------- .../text-generation/quantization/build_env.sh | 118 +++++++++++++ .../quantization/requirements.txt | 1 + .../quantization/run_generation.py | 143 ++++++++-------- .../transformers/modeling/modeling_auto.py | 128 +++++---------- 5 files changed, 308 insertions(+), 237 deletions(-) create mode 100644 examples/huggingface/pytorch/text-generation/quantization/build_env.sh diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md index 58a25f7fd2b..204f31b69cc 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/README.md +++ b/examples/huggingface/pytorch/text-generation/quantization/README.md @@ -6,106 +6,101 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI # Prerequisite​ ## 1. Create Environment​ -If you want to use Pytorch & Intel-extension-for-pytorch version 2.0.1, please -``` -pip install -r requirements.txt -``` -If you want to use Pytorch & Intel-extension-for-pytorch version 2.1, the dependent packages are listed in requirements, we recommend create environment as the following steps. +Pytorch & Intel-extension-for-pytorch version 2.1 is required, the dependent packages are listed in requirements, we recommend create environment as the following steps. ```bash -WORK_DIR=$PWD -# GCC 12.3 is required, please set it firstly -# Create environment (conda recommended) -conda create -n llm python=3.9 -y -# install deps, please try gcc, gxx 12.2 if 12.3 doesn't find from conda -conda install gcc=12.3 gxx=12.3 cxx-compiler -c conda-forge -y -conda install cmake ninja mkl mkl-include -y -conda install gperftools -c conda-forge -y - -# Install PyTorch -python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl - -# Install IPEX with semi-compiler, require gcc 12.3 or 12.2 -rm -rf llvm-project && mkdir llvm-project && cd llvm-project -wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/cmake-16.0.6.src.tar.xz -wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/llvm-16.0.6.src.tar.xz -tar -xf cmake-16.0.6.src.tar.xz && mv cmake-16.0.6.src cmake -tar -xf llvm-16.0.6.src.tar.xz && mv llvm-16.0.6.src llvm -mkdir build && cd build -cmake ../llvm -DCMAKE_INSTALL_PREFIX=${PWD}/_install/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -make install -j$(nproc) -ln -s ${PWD}/_install/llvm/bin/llvm-config ${CONDA_PREFIX}/bin/llvm-config-13 -cd ../../ - -git clone --branch llm_feature_branch https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git submodule sync && git submodule update --init --recursive -export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1 -export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS" -python setup.py install -cd ../ - -# disable semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models, other models don't need it. 
-export _DNNL_DISABLE_COMPILER_BACKEND=1
-
-# Install neural-compressor
-git clone https://github.com/intel/neural-compressor.git
-cd neural-compressor
+conda create -n llm python=3.9 -y
+conda activate llm
+bash build_env.sh
+git clone https://github.com/intel/intel-extension-for-transformers.git
+cd intel-extension-for-transformers
 pip install -r requirements.txt
 python setup.py install
+```
+> Note:
+> Disable the semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models; other
+> models don't need it.
+> `export _DNNL_DISABLE_COMPILER_BACKEND=1`
 
-# Install lm_eval
-pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
-# Install others deps
-pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3
-````
-We use the GPTJ defination script [modeling_gptj.py](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/modeling/gptj/modeling_gptj.py) in `run_generation.py`. Here is a little change to success trace.
-```diff
-# Line 602 in modeling_gptj.py on transformers 4.28.1
+> Note: if an `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error is raised when importing intel-extension-for-pytorch, the system libstdc++ is older than the required GCC runtime; locate and preload the conda-provided library as follows.
+> ```bash
+> find $CONDA_PREFIX | grep libstdc++.so.6
+> export LD_PRELOAD=<path to libstdc++.so.6>:${LD_PRELOAD}
+> ```
 
-- position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-+ position_ids = torch.arange(past_length, torch.tensor(input_shape[-1]) + torch.tensor(past_length), dtype=torch.long, device=device)
-```
-The changes for `llama` series models in `modeling_llama.py`, `dolly_v2_3b` series models in `modeling_gpt_neox.py`, `bloom` series models in `modeling_bloom.py` and `opt` series models in `modeling_opt.py` are similar to the above.
 # Run
+We support compression technologies such as `MixedPrecision`, `SmoothQuant` and `WeightOnlyQuant` with the `RTN/AWQ/TEQ/GPTQ` algorithms; `BitsAndBytes`-based quantization also works. The following commands show how to use them.
 
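+The commands below drive the Python API of `intel_extension_for_transformers`. As a rough, illustrative sketch (the model name and config values are placeholders, not tuned recommendations), the underlying calls look roughly like this:
+```python
+from transformers import AutoTokenizer
+from intel_extension_for_transformers.transformers import (
+    AutoModelForCausalLM,
+    MixedPrecisionConfig,
+    SmoothQuantConfig,
+    WeightOnlyQuantConfig,
+)
+
+model_name = "EleutherAI/gpt-j-6b"  # placeholder; any supported causal LM works
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# mixed precision (bf16 by default)
+bf16_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=MixedPrecisionConfig()
+)
+
+# weight-only quantization (RTN algorithm by default)
+woq_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=WeightOnlyQuantConfig(weight_dtype="int4_fullrange")
+)
+
+# smooth quantization; the tokenizer is used to build the default calibration dataloader
+sq_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=SmoothQuantConfig(tokenizer=tokenizer, calib_iters=100)
+)
+```
+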
-## 1. Quantization
+## 1. Performance
 ``` bash
-# --int8 is used for int8 only.
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
-python run_generation.py \
-    --model EleutherAI/gpt-j-6b \
-    --quantize \
-    --sq \
-    --alpha 1.0 \
-    --int8_bf16_mixed \
-    --ipex
-```
-## 2. Performance
-```bash
-# --int8 is used for int8 only.
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 export KMP_BLOCKTIME=1
 export KMP_SETTINGS=1
 export KMP_AFFINITY=granularity=fine,compact,1,0
 export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+# fp32
 OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
     --model EleutherAI/gpt-j-6b \
-    --benchmark \
-    --int8_bf16_mixed \
-    --ipex
+    --benchmark
+# mixedprecision
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --mixed_precision \
+    --benchmark
+# smoothquant
+# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --sq \
+    --alpha 1.0 \
+    --int8 \
+    --benchmark
+# weightonlyquant
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --woq \
+    --benchmark
+# bitsandbytes
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --bitsandbytes \
+    --benchmark
+
 ```
-## 3. Accuracy
+
+## 2. Accuracy
 ```bash
-# --int8 is used for int8 only.
+# fp32
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --accuracy \
+    --tasks "lambada_openai"
+# mixedprecision
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --mixed_precision \
+    --accuracy \
+    --tasks "lambada_openai"
+# smoothquant
 # --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 python run_generation.py \
-    --model EleutherAI/gpt-j-6b \
-    --accuracy \
-    --int8_bf16_mixed \
-    --ipex \
-    --tasks "lambada_openai"
+    --model EleutherAI/gpt-j-6b \
+    --sq \
+    --alpha 1.0 \
+    --int8 \
+    --accuracy \
+    --tasks "lambada_openai"
+# weightonlyquant
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --woq \
+    --accuracy \
+    --tasks "lambada_openai"
+# bitsandbytes
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --bitsandbytes \
+    --accuracy \
+    --tasks "lambada_openai"
 ```
diff --git a/examples/huggingface/pytorch/text-generation/quantization/build_env.sh b/examples/huggingface/pytorch/text-generation/quantization/build_env.sh
new file mode 100644
index 00000000000..2e81bd8e8d3
--- /dev/null
+++ b/examples/huggingface/pytorch/text-generation/quantization/build_env.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+set -x
+set -e
+
+VER_LLVM="llvmorg-16.0.6"
+VER_IPEX="7256d0848ba81bb802dd33fca0e33049a751db58"
+
+# Check existence of required Linux commands
+for CMD in conda git nproc make; do
+    command -v ${CMD} || (echo "Error: Command \"${CMD}\" not found." ; exit 4)
+done
+
+MAX_JOBS_VAR=$(nproc)
+if [ ! -z "${MAX_JOBS}" ]; then
+    MAX_JOBS_VAR=${MAX_JOBS}
+fi
+
+# Save current directory path
+BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd ${BASEFOLDER}
+# Checkout individual components
+if [ ! -d llvm-project ]; then
+    git clone https://github.com/llvm/llvm-project.git
+fi
+if [ ! -d intel-extension-for-pytorch ]; then
+    git clone https://github.com/intel/intel-extension-for-pytorch.git
+fi
+
+# Checkout required branch/commit and update submodules
+cd llvm-project
+if [ ! -z ${VER_LLVM} ]; then
+    git checkout ${VER_LLVM}
+fi
+git submodule sync
+git submodule update --init --recursive
+cd ..
+cd intel-extension-for-pytorch
+if [ ! -z ${VER_IPEX} ]; then
+    git checkout ${VER_IPEX}
+fi
+git submodule sync
+git submodule update --init --recursive
+cd ..
+
+# Install dependencies
+conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge
+conda update -y sysroot_linux-64
+python -m pip install cmake
+python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl
+ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
+
+# Compile individual component
+export CC=${CONDA_PREFIX}/bin/gcc
+export CXX=${CONDA_PREFIX}/bin/g++
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+
+# LLVM
+cd llvm-project
+LLVM_ROOT="$(pwd)/release"
+if [ -d ${LLVM_ROOT} ]; then
+    rm -rf ${LLVM_ROOT}
+fi
+if [ -d build ]; then
+    rm -rf build
+fi
+mkdir build
+cd build
+echo "***************************** cmake *****************************" > ../build.log
+cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=${ABI}" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF ../llvm 2>&1 | tee -a ../build.log
+echo "***************************** build *****************************" >> ../build.log
+cmake --build . -j ${MAX_JOBS_VAR} 2>&1 | tee -a ../build.log
+echo "**************************** install ****************************" >> ../build.log
+cmake -DCMAKE_INSTALL_PREFIX=${LLVM_ROOT} -P cmake_install.cmake 2>&1 | tee -a ../build.log
+#xargs rm -rf < install_manifest.txt
+cd ..
+rm -rf build
+ln -s ${LLVM_ROOT}/bin/llvm-config ${LLVM_ROOT}/bin/llvm-config-13
+export PATH=${LLVM_ROOT}/bin:$PATH
+export LD_LIBRARY_PATH=${LLVM_ROOT}/lib:$LD_LIBRARY_PATH
+cd ..
+# Intel® Extension for PyTorch*
+cd intel-extension-for-pytorch
+python -m pip install -r requirements.txt
+export LLVM_DIR=${LLVM_ROOT}/lib/cmake/llvm
+export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1
+CXXFLAGS_BK=${CXXFLAGS}
+export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS"
+python setup.py clean
+python setup.py bdist_wheel 2>&1 | tee build.log
+export CXXFLAGS=${CXXFLAGS_BK}
+unset DNNL_GRAPH_BUILD_COMPILER_BACKEND
+unset LLVM_DIR
+python -m pip install --force-reinstall dist/*.whl
+cd ..
+
+# Sanity Test
+set +x
+export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so
+echo "Note: Should you experience \"version \`GLIBCXX_N.N.NN' not found\" error, run command \"export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so\" and try again."
+python -c "import torch; import intel_extension_for_pytorch as ipex; print(f'torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}'); print(f'torch_version: {torch.__version__}'); print(f'ipex_version: {ipex.__version__}');"
+# Install neural-compressor
+git clone https://github.com/intel/neural-compressor.git
+cd neural-compressor
+pip install -r requirements.txt
+python setup.py install
+cd ..
+
+# Install intel-extension-for-pytorch
+git checkout -b int8_llama2
+pip install -r requirements.txt
+python setup.py install
+cd ..
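As a hedged aside for readers following the build script above (not part of build_env.sh or this patch series): once the script and the README install steps complete, a quick import check along these lines can confirm the stack is usable. All class names come from the hunks in this series; nothing else is assumed.

```python
# Editorial sanity-check sketch; assumes build_env.sh and the README install steps finished.
import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    MixedPrecisionConfig,
    SmoothQuantConfig,
    WeightOnlyQuantConfig,
)

print(f"torch_version: {torch.__version__}")
print(f"ipex_version: {ipex.__version__}")
print(f"torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}")
```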
+
+# Install lm_eval
+pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
+# Install other deps
+pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3
+
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
index bd067d4e1e6..4858296fa24 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
@@ -1,5 +1,6 @@
 accelerate
 datasets >= 2.0
+peft
 protobuf
 sentencepiece != 0.1.92
 torch >= 1.10.0
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
index b0529869455..02f5600d712 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
@@ -2,22 +2,18 @@
 import re
 import time
 import json
-import os
-import pathlib
 import torch
-import types
-from pathlib import Path
-from datasets import load_dataset, load_from_disk
-from torch.nn.functional import pad
-from torch.utils.data import DataLoader
-from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
+from transformers import AutoConfig, AutoTokenizer
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 from transformers.utils import check_min_version
-import transformers
-import numpy as np
-from itertools import chain
-from optimum.utils import NormalizedConfigManager
 from optimum.intel.generation.modeling import TSModelForCausalLM
+from intel_extension_for_transformers.transformers import (
+    MixedPrecisionConfig,
+    WeightOnlyQuantConfig,
+    SmoothQuantConfig,
+    BitsAndBytesConfig
+)
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -28,53 +24,74 @@ parser.add_argument(
     "--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k"
 )
-parser.add_argument("--dtype", type=str, default="int8")
 parser.add_argument(
     "--max-new-tokens", default=32, type=int, help="output max new tokens"
 )
 parser.add_argument("--output_dir", nargs="?", default="./saved_results")
-parser.add_argument("--quantize", action="store_true")
-parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.")
-parser.add_argument(
-    "--pad_max_length", default=512, type=int, help="Pad input ids to max length."
-)
 parser.add_argument("--int8", action="store_true")
 parser.add_argument(
     "--int8_bf16_mixed",
     action="store_true",
     help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
 )
+parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
+# ============Benchmark configs==============
 parser.add_argument("--benchmark", action="store_true")
 parser.add_argument("--iters", default=100, type=int, help="num iter")
 parser.add_argument("--num_warmup", default=10, type=int, help="num warmup")
+# ============Accuracy configs==============
 parser.add_argument("--accuracy", action="store_true")
-parser.add_argument("--batch_size", default=1, type=int,
+parser.add_argument("--batch_size", default=56, type=int,
                     help="batch size num.")
 parser.add_argument("--save_accuracy_path", default=None,
                     help="Save accuracy results path.")
-parser.add_argument("--tasks", nargs='+', default=["winogrande", "copa", "piqa", "rte", "hellaswag", \
-                    "openbookqa", "lambada_openai", "lambada_standard", "wikitext"], type=str, \
+parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], type=str, \
                     help="tasks list for accuracy validation")
-
+# ============MixedPrecision configs==============
+parser.add_argument("--mixed_precision", action="store_true")
+# ============SmoothQuant configs==============
+parser.add_argument("--sq", action="store_true")
+parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.")
+# ============WeightOnlyQuant configs===============
+parser.add_argument("--woq", action="store_true")
+parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
+                    help="Weight-only parameter.")
+parser.add_argument("--woq_dtype", type=str, default="int8",
+                    choices=["int8", "int4_clip", "int4_fullrange", "fp4_e2m1_bnb", "fp4_e2m1", "nf4"])
+parser.add_argument("--woq_group_size", type=int, default=-1)
+parser.add_argument("--woq_scheme", default="sym")
+parser.add_argument("--woq_enable_mse_search", action="store_true")
+parser.add_argument("--woq_enable_full_range", action="store_true")
+# =============WeightOnlyQuant GPTQ configs====================
+
+parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
+parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
+parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
+parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
+parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
+parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
+                    this should align with your model config, \
+                    and your dataset builder args: args.pad_max_length')
+# ============BitsAndBytes configs==============
+parser.add_argument("--bitsandbytes", action="store_true")
+# =======================================
 args = parser.parse_args()
-calib_size = 1
+# transformers version >= 4.32.0 contained the mpt modeling definition.
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py
+check_min_version("4.32.0")
 
-# model
+# get model config
 config = AutoConfig.from_pretrained(
     args.model,
     torchscript=True
-    if args.quantize
+    if (args.sq or args.woq_algo in ['AWQ', 'TEQ'])
     else False,  # torchscript will force `return_dict=False` to avoid jit errors
     use_cache=True, # to use kv cache.
     trust_remote_code=args.trust_remote_code,
     revision=args.revision,
 )
-# transformers version >= 4.32.0 contained the mpt modeling definition.
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py
-if config.model_type == "mpt":
-    check_min_version("4.32.0")
 
 # tokenizer
 if config.model_type == "llama":
@@ -83,15 +100,14 @@
 else:
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
-# quantize
-if args.quantize:
-    from intel_extension_for_transformers.transformers import (
-        AMPConfig,
-        WeightOnlyQuantizationConfig,
-        SmoothQuantConfig,
-        BitsAndBytesConfig
-
-    )
+# mixedprecision
+if args.mixed_precision:
+    mp_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
+                                                      quantization_config=mp_config
+                                                      )
+# smoothquant
+elif args.sq:
     from intel_extension_for_transformers.transformers import AutoModelForCausalLM
     if re.search("gptj", config.model_type) or re.search(
         "gpt_neox", config.model_type
@@ -113,49 +129,40 @@ op_type_dict=op_type_dict, # default is {}
         excluded_precisions=excluded_precisions,  # default is []
     )
-    # smooth-quant
-    q_model = AutoModelForCausalLM.from_pretrained(args.model,
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                     quantization_config=sq_config
                                                     )
-    print("sq done.")
-    # weight-only
-    woq_config = WeightOnlyQuantizationConfig(algorithm="RTN", # default is "RTN"
-                                              bits=8, # default is 8
-                                              group_size=-1, # default is -1
-                                              scheme="sym", # default is sym
-                                              enable_full_range=True # default is True
-                                              )
-    woq_model = AutoModelForCausalLM.from_pretrained(args.model,
+    config.save_pretrained(args.output_dir)
+    user_model.save(args.output_dir)
+# weight-only
+elif args.woq:
+    woq_config = WeightOnlyQuantConfig()
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                      quantization_config=woq_config
                                                      )
-    print("woq done.")
-    # amp
-    amp_config = AMPConfig(dtype="bfloat16") # default is bfloat16
-    amp_model = AutoModelForCausalLM.from_pretrained(args.model,
-                                                     quantization_config=amp_config
-                                                     )
-    print("amp done.")
-    # bitsandbytes
+# bitsandbytes
+elif args.bitsandbytes:
     bab_config = BitsAndBytesConfig()
-    bab_model = AutoModelForCausalLM.from_pretrained(args.model,
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                      quantization_config=bab_config
                                                      )
-    print("bitsandbytes done.")
-
+elif not args.int8 or args.int8_bf16_mixed:
+    user_model = AutoModelForCausalLM.from_pretrained(args.model, config=config)
+    # peft
+    if args.peft_model_id is not None:
+        from peft import PeftModel
+        user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
 
 # Generation
 generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
+
 if args.int8 or args.int8_bf16_mixed:
     # TorchScript model don't attribute generate method, the wrapper is provided.
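A hedged usage sketch of the unified `quantization_config` entry point that run_generation.py drives above: the classes and the keyword come from this patch, while the model id, prompt, and generation settings are illustrative only, and actual behavior depends on the quantization utilities the patch wires in.

```python
# Minimal sketch, not an authoritative reference for this patch.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    MixedPrecisionConfig,
    WeightOnlyQuantConfig,
)

model_name = "EleutherAI/gpt-j-6b"  # example model used throughout the README
tokenizer = AutoTokenizer.from_pretrained(model_name)

# bf16 mixed precision, mirroring the --mixed_precision branch above
bf16_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=MixedPrecisionConfig(dtype="bfloat16")
)

# default RTN weight-only quantization, mirroring the --woq branch above
woq_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=WeightOnlyQuantConfig()
)

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
output = woq_model.generate(input_ids, do_sample=False, num_beams=4, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```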
-    if args.ipex:
-        user_model = TSModelForCausalLM.from_pretrained(
-            args.output_dir, file_name="best_model.pt", trust_remote_code=args.trust_remote_code
-        )
-    else:
-        from neural_compressor.utils.pytorch import load
-
-        user_model = load(args.output_dir, user_model)
+    import intel_extension_for_pytorch as ipex
+    user_model = TSModelForCausalLM.from_pretrained(
+        args.output_dir, file_name="best_model.pt", trust_remote_code=args.trust_remote_code
+    )
 
 if args.benchmark:
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index d5ad2705a7e..8000c6dcaf0 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -30,16 +30,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-from transformers import AutoConfig, PretrainedConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
-from transformers.models.auto.modeling_auto import (MODEL_FOR_CAUSAL_LM_MAPPING,
-                                                    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-                                                    MODEL_MAPPING
-                                                    )
-from transformers.models.auto.auto_factory import _get_model_class
+import logging
+import torch
+import transformers
+
 from intel_extension_for_transformers.transformers.utils.utility import (
     LazyImport,
     generate_dummy_past_key_values,
@@ -57,86 +53,38 @@ logger = logging.getLogger(__name__)
 
 torch = LazyImport("torch")
-
-class _BaseAutoModelClass:
-    # Base class for auto models.
-    _model_mapping = None
+class _BaseQBitsAutoModelClass:
+    ORIG_MODEL = None
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         import intel_extension_for_transformers.transformers.modeling.modeling_map
 
-        config = kwargs.pop("config", None)
+        load_in_8bit = kwargs.pop("load_in_8bit", False)
+        load_in_4bit = kwargs.pop("load_in_4bit", False)
         calib_func = kwargs.pop("calib_func", None)
-        trust_remote_code = kwargs.pop("trust_remote_code", None)
-        kwargs["_from_auto"] = True
-        hub_kwargs_names = [
-            "cache_dir",
-            "code_revision",
-            "force_download",
-            "local_files_only",
-            "proxies",
-            "resume_download",
-            "revision",
-            "subfolder",
-            "use_auth_token",
-        ]
-        hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}
-
-        if not isinstance(config, PretrainedConfig):
-            kwargs_orig = copy.deepcopy(kwargs)
-            # ensure not to pollute the config object with torch_dtype="auto" - since it's
-            # meaningless in the context of the config object - torch.dtype values are acceptable
-            if kwargs.get("torch_dtype", None) == "auto":
-                _ = kwargs.pop("torch_dtype")
-            # to not overwrite the quantization_config if config has a quantization_config
-
-            if kwargs.get("quantization_config", None) is not None:
-                _ = kwargs.pop("quantization_config")
-
-            config, kwargs = AutoConfig.from_pretrained(
-                pretrained_model_name_or_path,
-                return_unused_kwargs=True,
-                trust_remote_code=trust_remote_code,
-                **hub_kwargs,
-                **kwargs,
-            )
+        quantization_config = kwargs.pop("quantization_config", None)
+        if isinstance(quantization_config, MixedPrecisionConfig):
+            kwargs["torch_dtype"] = torch.bfloat16
+        if load_in_8bit or load_in_4bit or quantization_config is not None:
+            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str
+            torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+
+            if load_in_4bit:
+                if quantization_config is None:
+                    quantization_config = WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="nf4")
+                else:
+                    assert "4" in quantization_config.weight_dtype and quantization_config.compute_dtype == torch_dtype, \
+                        f"Quantization_config.weight_dtype should be 'nf4', 'int4_fullrange', 'int4_clip',"
+                        f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}."
+            elif load_in_8bit:
+                if quantization_config is None:
+                    quantization_config = WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="int8")
+                else:
+                    assert quantization_config.weight_dtype == "int8" \
+                        and quantization_config.compute_dtype == torch_dtype, \
+                        f"Quantization_config.weight_dtype should be 'int8' and compute_dtype should be {torch_dtype}."
-            # if torch_dtype=auto was passed here, ensure to pass it on
-            if kwargs_orig.get("torch_dtype", None) == "auto":
-                kwargs["torch_dtype"] = "auto"
-            quantization_config = kwargs_orig.get("quantization_config", None)
-            if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or
-                                                        isinstance(quantization_config, MixedPrecisionConfig) or
-                                                        isinstance(quantization_config, WeightOnlyQuantConfig)
-                                                        ):
-                kwargs["quantization_config"] = kwargs_orig["quantization_config"]
-            if isinstance(quantization_config, MixedPrecisionConfig):
-                config.torch_dtype=torch.bfloat16
-
-        has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
-        has_local_code = type(config) in cls._model_mapping.keys()
-        trust_remote_code = resolve_trust_remote_code(
-            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
-        )
-        if has_remote_code and trust_remote_code:
-            class_ref = config.auto_map[cls.__name__]
-            model_class = get_class_from_dynamic_module(
-                class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs
-            )
-            _ = hub_kwargs.pop("code_revision", None)
-            model = model_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
-            )
-        elif type(config) in cls._model_mapping.keys():
-            model_class = _get_model_class(config, cls._model_mapping)
-            model = model_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
-            )
-        else:
-            raise ValueError(
-                f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
-                f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
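For readers of the branch above, a hedged sketch of how the `load_in_4bit`/`load_in_8bit` shorthands relate to an explicit `WeightOnlyQuantConfig`, per the defaults chosen in this hunk. Only the class, flags, and keyword names are taken from the patch; the model id is illustrative.

```python
# Sketch only; mirrors the defaulting logic in the hunk above.
import torch
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

model_name = "facebook/opt-125m"  # illustrative small model

# Shorthand: with no quantization_config, load_in_4bit=True defaults to NF4 weights
# and the popped torch_dtype (torch.float32 by default) as compute dtype.
m4 = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)

# Equivalent explicit form per the branch above.
m4_explicit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=WeightOnlyQuantConfig(compute_dtype=torch.float32, weight_dtype="nf4"),
)

# 8-bit counterpart: weight_dtype="int8" with the same compute dtype.
m8 = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
```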
-            )
+        model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         model.eval()
         if isinstance(quantization_config, WeightOnlyQuantConfig):
             logger.info("Applying Weight Only Quantization.")
@@ -238,15 +186,17 @@ def default_calib_func(model):
                 model,
                 conf,
                 calib_func=calib_func
-            ).model
+            )
         return model
 
+class AutoModelForCausalLM(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModelForCausalLM
+
+
+class AutoModel(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModel
 
-class AutoModelForCausalLM(_BaseAutoModelClass):
-    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING
 
-class AutoModel(_BaseAutoModelClass):
-    _model_mapping = MODEL_MAPPING
 
+class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModelForSeq2SeqLM
 
-class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
-    _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING

From cc645cf71192c8e32cfa2117c979d30eacaa0300 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 19:04:55 +0800
Subject: [PATCH 15/19] Update README.md

---
 .../pytorch/text-generation/quantization/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index 204f31b69cc..f4ca1ec3be1 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -6,7 +6,7 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI
 
 # Prerequisite
 ## 1. Create Environment
-Pytorch & Intel-extension-for-pytorch version 2.1 is required, the dependent packages are listed in requirements, we recommend create environment as the following steps.
+Pytorch and Intel-extension-for-pytorch version 2.1 are required; the dependent packages are listed in requirements, and we recommend creating the environment with the following steps.
 ```bash
 conda create -n llm python=3.9 -
@@ -21,7 +21,7 @@ python setup.py install
 > Disable semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models, other
 > models don't need it.
 > `export _DNNL_DISABLE_COMPILER_BACKEND=1`
-> Note: if `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error raised when import intel-extension-for-pytorch, it is due to the high gcc library request, there is the solution to find the correct version.
+> Note: If `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` is raised when importing intel-extension-for-pytorch, it is due to a newer gcc runtime requirement; use the following commands to find the correct library version.
 > ```bash
 > find $CONDA_PREFIX | grep libstdc++.so.6
 > export LD_PRELOAD=:${LD_PRELOAD}

From 66ae9a887c3ed449e223d1ef2041535725c79e6e Mon Sep 17 00:00:00 2001
From: changwangss
Date: Mon, 18 Sep 2023 04:15:11 -0700
Subject: [PATCH 16/19] fix long line

Signed-off-by: changwangss
---
 .../transformers/modeling/modeling_auto.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 8000c6dcaf0..ce5e668b281 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -66,7 +66,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         if isinstance(quantization_config, MixedPrecisionConfig):
             kwargs["torch_dtype"] = torch.bfloat16
         if load_in_8bit or load_in_4bit or quantization_config is not None:
-            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str
+            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
             torch_dtype = kwargs.pop("torch_dtype", torch.float32)
 
             if load_in_4bit:

From 509e132ba207605456170071b6f2a9591071ceba Mon Sep 17 00:00:00 2001
From: changwangss
Date: Mon, 18 Sep 2023 05:32:53 -0700
Subject: [PATCH 17/19] fix import

Signed-off-by: changwangss
---
 .../huggingface/pytorch/text-generation/quantization/README.md | 2 +-
 tests/requirements.txt                                         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index f4ca1ec3be1..67900340ef9 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -9,7 +9,7 @@
 
 Pytorch and Intel-extension-for-pytorch version 2.1 are required; the dependent packages are listed in requirements, and we recommend creating the environment with the following steps.
 ```bash
-conda create -n llm python=3.9 -
+conda create -n llm python=3.9 -y
 conda activate llm
 bash build_env.sh
 git clone https://github.com/intel/intel-extension-for-transformers.git
diff --git a/tests/requirements.txt b/tests/requirements.txt
index f988687f602..63ae3963112 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -18,3 +18,4 @@ evaluate
 wget
 optimum
 optimum-intel
+peft

From d48f055e129588f662c9f7824abadd70f30773c1 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 20:47:12 +0800
Subject: [PATCH 18/19] Update README.md

---
 .../pytorch/text-generation/quantization/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index 67900340ef9..8bee0702d41 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -49,7 +49,7 @@ OMP_NUM_THREADS= numactl -m -C python ru
     --mixed_precision \
     --benchmark
 # smoothquant
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 OMP_NUM_THREADS= numactl -m -C python run_generation.py \
     --model EleutherAI/gpt-j-6b \
     --sq \
@@ -83,7 +83,7 @@ python run_generation.py \
     --accuracy \
     --tasks "lambada_openai"
 # smoothquant
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 python run_generation.py \
     --model EleutherAI/gpt-j-6b \
     --sq \

From 3b4080b1ddc97f0a108d5da713920d9524b79e6e Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 22:12:18 +0800
Subject: [PATCH 19/19] Update test_quantization.py

---
 tests/test_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_quantization.py b/tests/test_quantization.py
index c9d027ea8d2..a1c0988560d 100644
--- a/tests/test_quantization.py
+++ b/tests/test_quantization.py
@@ -308,7 +308,7 @@ def test_quantization_for_llm(self):
         q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                        quantization_config=sq_config
                                                        )
-        self.assertTrue(isinstance(q_model, torch.jit.ScriptModule))
+        self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule))
         # weight-only
         woq_config = WeightOnlyQuantConfig()
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
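Closing the series with a hedged sketch of how the saved SmoothQuant artifact is reloaded for the `--int8`/`--int8_bf16_mixed` paths shown earlier: the save calls, the `best_model.pt` file name, and the `TSModelForCausalLM` wrapper all appear in the hunks above, while the output directory is simply the script's default.

```python
# Sketch of the reload path used by run_generation.py after SmoothQuant, per this patch.
import intel_extension_for_pytorch as ipex  # imported before loading, as the script does above
from optimum.intel.generation.modeling import TSModelForCausalLM

output_dir = "./saved_results"  # default --output_dir; config + TorchScript model saved here

# The quantized TorchScript model has no generate() of its own, so the wrapper provides it.
user_model = TSModelForCausalLM.from_pretrained(
    output_dir, file_name="best_model.pt", trust_remote_code=False
)
```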