From 6068d069964cecb9919dec289d740e056fbd6a3f Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 14 Sep 2023 04:00:50 -0700 Subject: [PATCH 01/19] enable text-generation with NeuralChat API Signed-off-by: changwangss --- .../quantization/run_generation.py | 152 ++---------------- .../llm/quantization/optimization.py | 144 ++++++++++++++++- .../neural_chat/config.py | 8 + 3 files changed, 164 insertions(+), 140 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 4774380dc58..6d7ed135f9d 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -17,6 +17,9 @@ import numpy as np from itertools import chain from optimum.utils import NormalizedConfigManager +# ipex dependency +import intel_extension_for_pytorch as ipex +from optimum.intel.generation.modeling import TSModelForCausalLM parser = argparse.ArgumentParser() @@ -58,6 +61,7 @@ parser.add_argument("--tasks", nargs='+', default=["winogrande", "copa", "piqa", "rte", "hellaswag", \ "openbookqa", "lambada_openai", "lambada_standard", "wikitext"], type=str, \ help="tasks list for accuracy validation") + args = parser.parse_args() calib_size = 1 @@ -82,6 +86,7 @@ config=config ) +# tokenizer if config.model_type == "llama": from transformers import LlamaTokenizer tokenizer = LlamaTokenizer.from_pretrained(args.model) @@ -92,125 +97,10 @@ user_model = user_model.to(memory_format=torch.channels_last) user_model.eval() -if args.ipex: - import intel_extension_for_pytorch as ipex - from optimum.intel.generation.modeling import TSModelForCausalLM - # quantize if args.quantize: - def generate_dummy_past_key_values(input_bs, user_model): - normalized_config = NormalizedConfigManager.get_normalized_config_class( - user_model.config.model_type - )(user_model.config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - - if user_model.config.model_type == "bloom": - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_attention_heads, d_k, 1] - else: - new_shape = [input_bs * num_attention_heads, 1, d_k] - pkv = pkv + (torch.ones(size=new_shape),) - else: - new_shape = [input_bs, num_attention_heads, 1, d_k] - dummy_tensor = torch.ones(size=new_shape) - pkv = tuple(dummy_tensor for _ in range(nb_pkv)) - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) - return past_key_values - - class Evaluator: - def __init__( - self, - dataset, - tokenizer, - batch_size=8, - pad_val=1, - pad_max=512, - is_calib=False, - ): - self.dataset = dataset - self.tokenizer = tokenizer - self.batch_size = batch_size - self.pad_val = pad_val - self.pad_max = pad_max - self.is_calib = is_calib - - # tokenize the dataset - self.dataset = self.dataset.map(self.tokenize_function, batched=True) - self.dataset.set_format(type="torch", columns=["input_ids"]) - - @torch.no_grad() - def tokenize_function(self, examples): - example = self.tokenizer(examples["text"]) - return example - - @torch.no_grad() - def collate_batch(self, batch): - input_ids_padded = [] - last_ind = [] - for text in batch: - input_ids = text["input_ids"] - pad_len = self.pad_max - input_ids.shape[0] - last_ind.append(input_ids.shape[0] - 1) - if self.is_calib: 
- input_ids = ( - input_ids[: self.pad_max] - if len(input_ids) > self.pad_max - else input_ids - ) - else: - input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) - input_ids_padded.append(input_ids) - return ( - torch.vstack(input_ids_padded), - torch.tensor(last_ind), - ) - - calib_dataset = load_dataset(args.dataset, split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - calib_evaluator = Evaluator( - calib_dataset, - tokenizer, - args.batch_size, - pad_max=args.pad_max_length, - is_calib=True, - ) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) - input_ids = user_model.dummy_inputs["input_ids"] - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, user_model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - example_inputs = (input_ids, tuple(past_key_values), attention_mask) - # do inference to check example_inputs formats - user_model(*example_inputs) - - def calib_func(prepared_model): - for i, (input_ids, last_ind) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, user_model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= args.calib_iters: - break - prepared_model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - - from neural_compressor import PostTrainingQuantConfig, quantization - + from intel_extension_for_transformers.neural_chat.config import SmoothQuantConfig + from intel_extension_for_transformers.llm.quantization.optimization import Optimization if re.search("gptj", user_model.config.model_type) or re.search( "gpt_neox", user_model.config.model_type ): @@ -225,30 +115,18 @@ def calib_func(prepared_model): else: op_type_dict = {} excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if args.sq: - args.alpha = args.alpha if args.alpha == "auto" else float(args.alpha) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": args.alpha}} - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - else: - conf = PostTrainingQuantConfig( - backend="ipex" if args.ipex else "default", - excluded_precisions=excluded_precisions, - op_type_dict=op_type_dict, - example_inputs=example_inputs, - ) + config = SmoothQuantConfig(alpha=float(args.alpha), + op_type_dict=op_type_dict, + excluded_precisions=excluded_precisions + ) # save config user_model.config.save_pretrained(args.output_dir) - q_model = quantization.fit( + optimization = Optimization(config) + q_model = optimization.optimize( user_model, - conf, - calib_func=calib_func, + tokenizer ) + # save model q_model.save(args.output_dir) # Generation diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 5db84390df2..52d5695b03d 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -15,21 +15,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from doctest import Example from typing import Union +from venv import logger +from intel_extension_for_transformers.transformers.utils.utility import LazyImport from intel_extension_for_transformers.neural_chat.config import ( AMPConfig, WeightOnlyQuantizationConfig, - BitsAndBytesConfig + BitsAndBytesConfig, + SmoothQuantConfig ) +import logging +logger = logging.getLogger(__name__) +torch = LazyImport("torch") class Optimization: def __init__( self, - optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig] + optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig, SmoothQuantConfig] ): self.optimization_config = optimization_config - def optimize(self, model): + def optimize(self, model, tokenizer=None, calib_func=None): + """ + Optimize the model with a given config. + """ optimized_model = model config = self.optimization_config if isinstance(config, WeightOnlyQuantizationConfig): @@ -55,4 +65,132 @@ def optimize(self, model): model, conf, ).model + elif isinstance(config, SmoothQuantConfig): + print("Applying SmoothQuant.") + if tokenizer is None: + logger.error("Please provide the tokenizer. \n" + + "from transformer import AutoTokenizer \n" + + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + + "Or provide calib_func directly." + ) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + return tokenizer(examples["text"]) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = self.generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= 100: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": config.alpha}} + example_inputs = self.get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=config.excluded_precisions, + op_type_dict=config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + optimized_model = quantization.fit( + model, + conf, + calib_func=calib_func, + ) return optimized_model + + def generate_dummy_past_key_values(self, input_bs, model): + """ + Generate the dummy past_key_values. + """ + from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( + model.config.model_type + )(model.config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + + if model.config.model_type == "bloom": + pkv = () + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_attention_heads, d_k, 1] + else: + new_shape = [input_bs * num_attention_heads, 1, d_k] + pkv = pkv + (torch.ones(size=new_shape),) + else: + new_shape = [input_bs, num_attention_heads, 1, d_k] + dummy_tensor = torch.ones(size=new_shape) + pkv = tuple(dummy_tensor for _ in range(nb_pkv)) + past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + return past_key_values + + def get_example_inputs_for_trace(self, model, return_type="tuple"): + """ + Generate the example_input for tracing, support models load from AutoModelForCausalLM. 
+ + """ + input_ids = model.dummy_inputs["input_ids"] + input_bs, input_len = input_ids.shape + past_key_values = self.generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + example_inputs = (input_ids, tuple(past_key_values), attention_mask) + # do inference to check example_inputs formats + model(*example_inputs) + if return_type != "tuple": + example_inputs = { + "input_ids": input_ids, + "past_key_values": tuple(past_key_values), + "attention_mask": attention_mask + } + return example_inputs + diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 7248dc0bcb7..e8654d22978 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -417,6 +417,14 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: dtype: str = 'bfloat16' + op_type_dict = None + + +@dataclass +class SmoothQuantConfig: + alpha: float = 0.5 + op_type_dict: dict = None + excluded_precisions: dict = None class PipelineConfig: def __init__(self, From fccc16ade37219a55a062b5e33c07e2e50f39b74 Mon Sep 17 00:00:00 2001 From: changwangss Date: Thu, 14 Sep 2023 04:28:49 -0700 Subject: [PATCH 02/19] fix wrong typing and hide import Signed-off-by: changwangss --- .../text-generation/quantization/run_generation.py | 7 +------ .../llm/quantization/optimization.py | 10 ++++++---- intel_extension_for_transformers/neural_chat/config.py | 4 +--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 6d7ed135f9d..2a28ae127f3 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -17,8 +17,6 @@ import numpy as np from itertools import chain from optimum.utils import NormalizedConfigManager -# ipex dependency -import intel_extension_for_pytorch as ipex from optimum.intel.generation.modeling import TSModelForCausalLM @@ -37,13 +35,10 @@ ) parser.add_argument("--output_dir", nargs="?", default="./saved_results") parser.add_argument("--quantize", action="store_true") -parser.add_argument("--ipex", action="store_true") -parser.add_argument("--sq", action="store_true") parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") parser.add_argument( "--pad_max_length", default=512, type=int, help="Pad input ids to max length." ) -parser.add_argument("--calib_iters", default=512, type=int, help="calibration iters.") parser.add_argument("--int8", action="store_true") parser.add_argument( "--int8_bf16_mixed", @@ -70,7 +65,7 @@ config = AutoConfig.from_pretrained( args.model, torchscript=True - if args.ipex + if args.quantize else False, # torchscript will force `return_dict=False` to avoid jit errors use_cache=True, # to use kv cache. 
trust_remote_code=args.trust_remote_code, diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 52d5695b03d..e0b400c3499 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -67,12 +67,14 @@ def optimize(self, model, tokenizer=None, calib_func=None): ).model elif isinstance(config, SmoothQuantConfig): print("Applying SmoothQuant.") + import intel_extension_for_pytorch if tokenizer is None: - logger.error("Please provide the tokenizer. \n" + - "from transformer import AutoTokenizer \n" + - "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + - "Or provide calib_func directly." + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" ) + exit(0) if calib_func is None: from datasets import load_dataset from torch.utils.data import DataLoader diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index e8654d22978..13e0c3494db 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -416,9 +416,7 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: - dtype: str = 'bfloat16' - op_type_dict = None - + dtype: str = 'bfloat16' @dataclass class SmoothQuantConfig: From 4e989a506be26ac6ff71b69a4b1948e314cb01dc Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Thu, 14 Sep 2023 19:36:09 +0800 Subject: [PATCH 03/19] improve import check --- .../llm/quantization/optimization.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index e0b400c3499..07796efa16c 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -26,6 +26,7 @@ SmoothQuantConfig ) import logging +import warnings logger = logging.getLogger(__name__) torch = LazyImport("torch") @@ -67,7 +68,12 @@ def optimize(self, model, tokenizer=None, calib_func=None): ).model elif isinstance(config, SmoothQuantConfig): print("Applying SmoothQuant.") - import intel_extension_for_pytorch + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) if tokenizer is None: logger.error("Please provide the tokenizer or provide calib_func directly," + " the following is how to get tokenizer. 
\n" + From 40d20eb94a223bd914bd1898dea1d0f8e53e688a Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 01:11:16 -0700 Subject: [PATCH 04/19] rebase main Signed-off-by: changwangss --- .../quantization/run_generation.py | 95 ++++--- .../transformers/__init__.py | 36 +-- .../transformers/modeling/modeling_causal.py | 250 ++++++++++++++++++ .../transformers/utils/__init__.py | 9 +- .../transformers/utils/utility.py | 51 ++++ 5 files changed, 377 insertions(+), 64 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_causal.py diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py index 2a28ae127f3..b0529869455 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py +++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py @@ -12,7 +12,6 @@ from torch.utils.data import DataLoader from transformers import AutoConfig, AutoTokenizer, PretrainedConfig from transformers.utils import check_min_version -from intel_extension_for_transformers.transformers import AutoModelForCausalLM import transformers import numpy as np from itertools import chain @@ -63,46 +62,44 @@ # model config = AutoConfig.from_pretrained( - args.model, - torchscript=True - if args.quantize - else False, # torchscript will force `return_dict=False` to avoid jit errors - use_cache=True, # to use kv cache. - trust_remote_code=args.trust_remote_code, - revision=args.revision - ) + args.model, + torchscript=True + if args.quantize + else False, # torchscript will force `return_dict=False` to avoid jit errors + use_cache=True, # to use kv cache. + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + # transformers version >= 4.32.0 contained the mpt modeling definition. 
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py if config.model_type == "mpt": check_min_version("4.32.0") -user_model = AutoModelForCausalLM.from_pretrained( - args.model, - config=config -) - # tokenizer if config.model_type == "llama": - from transformers import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained(args.model) + from transformers import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained(args.model) else: - tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) - -# to channels last -user_model = user_model.to(memory_format=torch.channels_last) -user_model.eval() + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) # quantize if args.quantize: - from intel_extension_for_transformers.neural_chat.config import SmoothQuantConfig - from intel_extension_for_transformers.llm.quantization.optimization import Optimization - if re.search("gptj", user_model.config.model_type) or re.search( - "gpt_neox", user_model.config.model_type + from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig, + BitsAndBytesConfig + + ) + from intel_extension_for_transformers.transformers import AutoModelForCausalLM + if re.search("gptj", config.model_type) or re.search( + "gpt_neox", config.model_type ): op_type_dict = { "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, } - elif re.search("mpt", user_model.config.model_type): + elif re.search("mpt", config.model_type): op_type_dict = { "add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, "":{"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}, @@ -110,19 +107,41 @@ else: op_type_dict = {} excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - config = SmoothQuantConfig(alpha=float(args.alpha), - op_type_dict=op_type_dict, - excluded_precisions=excluded_precisions + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + alpha=float(args.alpha), # default is 0.5 + op_type_dict=op_type_dict, # default is {} + excluded_precisions=excluded_precisions, # default is [] ) - # save config - user_model.config.save_pretrained(args.output_dir) - optimization = Optimization(config) - q_model = optimization.optimize( - user_model, - tokenizer - ) - # save model - q_model.save(args.output_dir) + # smooth-quant + q_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=sq_config + ) + print("sq done.") + # weight-only + woq_config = WeightOnlyQuantizationConfig(algorithm="RTN", # default is "RTN" + bits=8, # default is 8 + group_size=-1, # default is -1 + scheme="sym", # default is sym + enable_full_range=True # default is True + ) + woq_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=woq_config + ) + print("woq done.") + # amp + amp_config = AMPConfig(dtype="bfloat16") # default is bfloat16 + amp_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=amp_config + ) + print("amp done.") + # bitsandbytes + bab_config = BitsAndBytesConfig() + bab_model = AutoModelForCausalLM.from_pretrained(args.model, + quantization_config=bab_config + ) + print("bitsandbytes done.") + # Generation generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4) diff --git a/intel_extension_for_transformers/transformers/__init__.py 
b/intel_extension_for_transformers/transformers/__init__.py index d78c4288971..fbec8b11cb7 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -16,33 +16,19 @@ # limitations under the License. -from .config import ( - AutoDistillationConfig, - DistillationConfig, - FlashDistillationConfig, - TFDistillationConfig, - NASConfig, - Provider, - PruningConfig, - QuantizationConfig, - WEIGHTS_NAME, - DynamicLengthConfig, - BenchmarkConfig, - PrunerV2, - -) -from .distillation import ( - DistillationCriterionMode, - SUPPORTED_DISTILLATION_CRITERION_MODE, -) -from .modeling import OptimizedModel, AutoModelForCausalLM +from .config import (WEIGHTS_NAME, AutoDistillationConfig, BenchmarkConfig, + DistillationConfig, DynamicLengthConfig, + FlashDistillationConfig, NASConfig, Provider, PrunerV2, + PruningConfig, QuantizationConfig, TFDistillationConfig) +from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, + DistillationCriterionMode) from .mixture.auto_distillation import AutoDistillation +from .modeling import AutoModelForCausalLM, OptimizedModel from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization -from .pruning import PrunerConfig, PruningMode, SUPPORTED_PRUNING_MODE -from .quantization import QuantizationMode, SUPPORTED_QUANT_MODE -from .utils import metrics -from .utils import objectives +from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode +from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode +from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, + WeightOnlyQuantizationConfig, metrics, objectives) from .utils.utility import LazyImport - diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py new file mode 100644 index 00000000000..e090d092ad2 --- /dev/null +++ b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py @@ -0,0 +1,250 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy + +from transformers import AutoConfig, PretrainedConfig +from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING +from transformers.models.auto.auto_factory import _get_model_class +from intel_extension_for_transformers.transformers.utils.utility import ( + LazyImport, + generate_dummy_past_key_values, + get_example_inputs_for_trace +) + + +from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig +) +import logging +import warnings +logger = logging.getLogger(__name__) +torch = LazyImport("torch") + + +class _BaseAutoModelClass: + # Base class for auto models. + _model_mapping = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + import intel_extension_for_transformers.transformers.modeling.modeling_map + config = kwargs.pop("config", None) + calib_func = kwargs.pop("calib_func", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "code_revision", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + quantization_config = kwargs_orig.get("quantization_config", None) + if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or + isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, WeightOnlyQuantizationConfig) + ): + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + if isinstance(quantization_config, AMPConfig): + config.torch_dtype=torch.bfloat16 + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs + ) + _ = hub_kwargs.pop("code_revision", None) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + model = model_class.from_pretrained( + 
pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + else: + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + model.eval() + if isinstance(quantization_config, WeightOnlyQuantizationConfig): + logger.info("Applying Weight Only Quantization.") + from neural_compressor import PostTrainingQuantConfig, quantization + op_type_dict = { + '.*':{ # re.match + "weight": { + 'bits': quantization_config.bits, # 1-8 bits + 'group_size': quantization_config.group_size, # -1 (per-channel) + 'scheme': quantization_config.scheme, # sym/asym + 'algorithm': quantization_config.algorithm, # RTN/AWQ/TEQ + }, + }, + } + recipes = {"rtn_args": {"enable_full_range": quantization_config.enable_full_range}} + conf = PostTrainingQuantConfig( + approach='weight_only', + op_type_dict=op_type_dict, + recipes=recipes, + ) + model.config.torchscript = True + model = quantization.fit( + model, + conf, + ).model + elif isinstance(quantization_config, SmoothQuantConfig): + logger.info("Applying SmoothQuant.") + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) + if quantization_config.tokenizer is None: + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + return quantization_config.tokenizer(examples["text"]) + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= 100: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} + example_inputs = get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + model.config.torchscript = True + model = quantization.fit( + model, + conf, + calib_func=calib_func + ).model + return model + + +class AutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index fa9f139b97d..1b574a0d3bf 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -15,4 +15,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Utils for optimization.""" \ No newline at end of file +"""Utils for optimization.""" + +from .quantization_config import ( + AMPConfig, + BitsAndBytesConfig, + SmoothQuantConfig, + WeightOnlyQuantizationConfig, +) diff --git a/intel_extension_for_transformers/transformers/utils/utility.py b/intel_extension_for_transformers/transformers/utils/utility.py index b7572fe69c4..82030c4a751 100644 --- a/intel_extension_for_transformers/transformers/utils/utility.py +++ b/intel_extension_for_transformers/transformers/utils/utility.py @@ -30,6 +30,7 @@ DECODER_WITH_PAST_NAME = "decoder_with_past_model.bin" WEIGHTS_NAME = "pytorch_model.bin" +torch = LazyImport("torch") def distributed_init(backend="gloo", world_size=1, rank=-1, init_method=None, master_addr='127.0.0.1', master_port='12345'): @@ -72,3 +73,53 @@ def __init__(self) -> None: self.batch_size = dataloader.total_batch_size self.dataset = dataloader.dataset return INCDataLoader() + +def generate_dummy_past_key_values(input_bs, model): + """ + Generate the dummy past_key_values. 
+ """ + from optimum.utils import NormalizedConfigManager + normalized_config = NormalizedConfigManager.get_normalized_config_class( + model.config.model_type + )(model.config) + nb_pkv = 2 + num_layers = normalized_config.num_layers + num_attention_heads = normalized_config.num_attention_heads + hidden_size = normalized_config.hidden_size + d_k = hidden_size // num_attention_heads + + if model.config.model_type == "bloom": + pkv = () + for nb_pkv in range(nb_pkv): + if nb_pkv % 2 == 0: + new_shape = [input_bs * num_attention_heads, d_k, 1] + else: + new_shape = [input_bs * num_attention_heads, 1, d_k] + pkv = pkv + (torch.ones(size=new_shape),) + else: + new_shape = [input_bs, num_attention_heads, 1, d_k] + dummy_tensor = torch.ones(size=new_shape) + pkv = tuple(dummy_tensor for _ in range(nb_pkv)) + past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + return past_key_values + +def get_example_inputs_for_trace(model, return_type="tuple"): + """ + Generate the example_input for tracing, support models load from AutoModelForCausalLM. + + """ + input_ids = model.dummy_inputs["input_ids"] + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + example_inputs = (input_ids, tuple(past_key_values), attention_mask) + # do inference to check example_inputs formats + model(*example_inputs) + if return_type != "tuple": + example_inputs = { + "input_ids": input_ids, + "past_key_values": tuple(past_key_values), + "attention_mask": attention_mask + } + return example_inputs \ No newline at end of file From 2dab42133be3569d745802eaeef45dafd9d10b8e Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 01:37:42 -0700 Subject: [PATCH 05/19] remove the outdated code Signed-off-by: changwangss --- .../llm/quantization/optimization.py | 152 +----------------- .../neural_chat/config.py | 8 +- .../transformers/utils/quantization_config.py | 43 +++++ 3 files changed, 47 insertions(+), 156 deletions(-) create mode 100644 intel_extension_for_transformers/transformers/utils/quantization_config.py diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py index 07796efa16c..5db84390df2 100644 --- a/intel_extension_for_transformers/llm/quantization/optimization.py +++ b/intel_extension_for_transformers/llm/quantization/optimization.py @@ -15,32 +15,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from doctest import Example from typing import Union -from venv import logger -from intel_extension_for_transformers.transformers.utils.utility import LazyImport from intel_extension_for_transformers.neural_chat.config import ( AMPConfig, WeightOnlyQuantizationConfig, - BitsAndBytesConfig, - SmoothQuantConfig + BitsAndBytesConfig ) -import logging -import warnings -logger = logging.getLogger(__name__) -torch = LazyImport("torch") class Optimization: def __init__( self, - optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig, SmoothQuantConfig] + optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig] ): self.optimization_config = optimization_config - def optimize(self, model, tokenizer=None, calib_func=None): - """ - Optimize the model with a given config. 
- """ + def optimize(self, model): optimized_model = model config = self.optimization_config if isinstance(config, WeightOnlyQuantizationConfig): @@ -66,139 +55,4 @@ def optimize(self, model, tokenizer=None, calib_func=None): model, conf, ).model - elif isinstance(config, SmoothQuantConfig): - print("Applying SmoothQuant.") - try: - import intel_extension_for_pytorch as ipex - except ImportError: - warnings.warn( - "Please install Intel Extension for PyTorch to accelerate the model inference." - ) - if tokenizer is None: - logger.error("Please provide the tokenizer or provide calib_func directly," + - " the following is how to get tokenizer. \n" + - " from transformer import AutoTokenizer \n" + - " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - if calib_func is None: - from datasets import load_dataset - from torch.utils.data import DataLoader - calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - return tokenizer(examples["text"]) - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - - def collate_batch(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - input_ids = ( - input_ids[: 512] - if len(input_ids) > 512 - else input_ids - ) - input_ids_padded.append(input_ids) - return (torch.vstack(input_ids_padded)) - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - def default_calib_func(model): - """ - This is the default calibration function, the dataset is NeelNanda/pile-10k, - the default calib_iters is 100. - """ - - for i, (input_ids) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = self.generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= 100: - break - model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": config.alpha}} - example_inputs = self.get_example_inputs_for_trace(model) - from neural_compressor import PostTrainingQuantConfig, quantization - conf = PostTrainingQuantConfig( - backend="ipex", - excluded_precisions=config.excluded_precisions, - op_type_dict=config.op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - if calib_func is None: - logger.info("The default calibration funcation is used, " + - "the calibration dataset is NeelNanda/pile-10k," + - "batchsize is 1 and calibration iteration is 100.") - calib_func = default_calib_func - else: - calib_func = calib_func - optimized_model = quantization.fit( - model, - conf, - calib_func=calib_func, - ) return optimized_model - - def generate_dummy_past_key_values(self, input_bs, model): - """ - Generate the dummy past_key_values. 
- """ - from optimum.utils import NormalizedConfigManager - normalized_config = NormalizedConfigManager.get_normalized_config_class( - model.config.model_type - )(model.config) - nb_pkv = 2 - num_layers = normalized_config.num_layers - num_attention_heads = normalized_config.num_attention_heads - hidden_size = normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - - if model.config.model_type == "bloom": - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_bs * num_attention_heads, d_k, 1] - else: - new_shape = [input_bs * num_attention_heads, 1, d_k] - pkv = pkv + (torch.ones(size=new_shape),) - else: - new_shape = [input_bs, num_attention_heads, 1, d_k] - dummy_tensor = torch.ones(size=new_shape) - pkv = tuple(dummy_tensor for _ in range(nb_pkv)) - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) - return past_key_values - - def get_example_inputs_for_trace(self, model, return_type="tuple"): - """ - Generate the example_input for tracing, support models load from AutoModelForCausalLM. - - """ - input_ids = model.dummy_inputs["input_ids"] - input_bs, input_len = input_ids.shape - past_key_values = self.generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - example_inputs = (input_ids, tuple(past_key_values), attention_mask) - # do inference to check example_inputs formats - model(*example_inputs) - if return_type != "tuple": - example_inputs = { - "input_ids": input_ids, - "past_key_values": tuple(past_key_values), - "attention_mask": attention_mask - } - return example_inputs - diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 13e0c3494db..7248dc0bcb7 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -416,13 +416,7 @@ class WeightOnlyQuantizationConfig: @dataclass class AMPConfig: - dtype: str = 'bfloat16' - -@dataclass -class SmoothQuantConfig: - alpha: float = 0.5 - op_type_dict: dict = None - excluded_precisions: dict = None + dtype: str = 'bfloat16' class PipelineConfig: def __init__(self, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py new file mode 100644 index 00000000000..512322825dc --- /dev/null +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Configs for intel extension for transformers.""" + +from dataclasses import dataclass +from typing import Optional, Any +from transformers import BitsAndBytesConfig + + +@dataclass +class WeightOnlyQuantizationConfig: + algorithm: str = 'RTN' + bits: int = 8 + group_size: int = -1 + scheme: str = 'sym' + enable_full_range: bool = True + +@dataclass +class AMPConfig: + dtype: str = 'bfloat16' + +@dataclass +class SmoothQuantConfig: + tokenizer: Any = None + calib_func: Any = None + alpha: float = 0.5 + op_type_dict: dict = None + excluded_precisions: dict = None + From 7ba2aede6a5b3688231dc6268a14195e6d91cb37 Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Fri, 15 Sep 2023 16:46:26 +0800 Subject: [PATCH 06/19] update order --- intel_extension_for_transformers/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index fbec8b11cb7..8cf610f1015 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -23,7 +23,6 @@ from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, DistillationCriterionMode) from .mixture.auto_distillation import AutoDistillation -from .modeling import AutoModelForCausalLM, OptimizedModel from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization @@ -32,3 +31,4 @@ from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantizationConfig, metrics, objectives) from .utils.utility import LazyImport +from .modeling import AutoModelForCausalLM, OptimizedModel From 205f8eccf248b57e1d6dfd63e371f7b0c05bc459 Mon Sep 17 00:00:00 2001 From: changwangss Date: Fri, 15 Sep 2023 04:04:44 -0700 Subject: [PATCH 07/19] improve sqconfig and add ut Signed-off-by: changwangss --- .../transformers/modeling/modeling_causal.py | 18 ++++++-- .../transformers/utils/quantization_config.py | 19 ++++---- tests/test_quantization.py | 46 +++++++++++++++++++ 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py index e090d092ad2..92686191a28 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py @@ -176,11 +176,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if calib_func is None: from datasets import load_dataset from torch.utils.data import DataLoader - calib_dataset = load_dataset("NeelNanda/pile-10k", split="train") + calib_dataset = quantization_config.calib_dataset + calib_iters = quantization_config.calib_iters + calib_dataset = load_dataset(calib_dataset, split="train") calib_dataset = calib_dataset.shuffle(seed=42) def tokenize_function(examples): - return quantization_config.tokenizer(examples["text"]) + if 'prompt' in examples: + example = quantization_config.tokenizer(examples["prompt"]) + elif 'text' in examples: + example = quantization_config.tokenizer(examples["text"]) + elif 'code' in examples: + example = quantization_config.tokenizer(examples["code"]) + else: + logger.error("Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset.") + exit(0) + return example tokenized_dataset = 
calib_dataset.map(tokenize_function, batched=True) tokenized_dataset.set_format(type="torch", columns=["input_ids"]) @@ -213,7 +225,7 @@ def default_calib_func(model): past_key_values = generate_dummy_past_key_values(input_bs, model) attention_mask = torch.ones(input_bs, input_len + 1) attention_mask[:,0] = 0 - if i >= 100: + if i >= calib_iters: break model( input_ids=input_ids, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 512322825dc..1c590609055 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -16,28 +16,31 @@ # limitations under the License. """Configs for intel extension for transformers.""" -from dataclasses import dataclass -from typing import Optional, Any +from dataclasses import dataclass, field +from typing import Any, Optional + from transformers import BitsAndBytesConfig @dataclass class WeightOnlyQuantizationConfig: - algorithm: str = 'RTN' + algorithm: str = "RTN" bits: int = 8 group_size: int = -1 - scheme: str = 'sym' + scheme: str = "sym" enable_full_range: bool = True + @dataclass class AMPConfig: - dtype: str = 'bfloat16' + dtype: str = "bfloat16" @dataclass class SmoothQuantConfig: tokenizer: Any = None calib_func: Any = None + calib_dataset: str = "NeelNanda/pile-10k" + calib_iters: int = 100 alpha: float = 0.5 - op_type_dict: dict = None - excluded_precisions: dict = None - + op_type_dict: dict = None + excluded_precisions: list = field(default_factory=list) diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 08aa2e504bc..49031214936 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -286,6 +286,52 @@ def test_bf16_onnx(self): self.assertEqual(tensor.data_type, TensorProto.BFLOAT16) break + def test_quantization_for_llm(self): + model_name_or_path = "facebook/opt-125m" + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantizationConfig, + SmoothQuantConfig, + BitsAndBytesConfig + + ) + from intel_extension_for_transformers.transformers import AutoModelForCausalLM + fp32_model = AutoModelForCausalLM.from_pretrained(model_name_or_path) + dummy_input = fp32_model.dummy_inputs["input_ids"] + + # smooth-quant + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + calib_iters=5 + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + # weight-only + woq_config = WeightOnlyQuantizationConfig() + woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=woq_config + ) + output = woq_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.139640808105469) + # amp + amp_config = AMPConfig() + amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=amp_config + ) + output = amp_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.347761154174805) + + + # bitsandbytes + bab_config = BitsAndBytesConfig() + bab_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=bab_config + ) + output = bab_model(dummy_input) + self.assertTrue(float(output[0][0][0][0]), -7.347761154174805) if __name__ == "__main__": 
unittest.main() From 3b234777ed83c59924870a267fcbfc313cc23dbe Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:11:28 -0700 Subject: [PATCH 08/19] refine woq Signed-off-by: changwangss --- .../llm/quantization/utils.py | 2 +- .../transformers/__init__.py | 41 ++- .../transformers/modeling/__init__.py | 3 +- .../transformers/modeling/modeling_auto.py | 327 ++++++++++++++---- .../transformers/modeling/modeling_causal.py | 262 -------------- .../transformers/utils/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 203 ++++++++++- tests/test_quantization.py | 20 +- 8 files changed, 492 insertions(+), 368 deletions(-) delete mode 100644 intel_extension_for_transformers/transformers/modeling/modeling_causal.py diff --git a/intel_extension_for_transformers/llm/quantization/utils.py b/intel_extension_for_transformers/llm/quantization/utils.py index 31f86efef90..f20105c8a55 100644 --- a/intel_extension_for_transformers/llm/quantization/utils.py +++ b/intel_extension_for_transformers/llm/quantization/utils.py @@ -184,7 +184,7 @@ def convert_to_quantized_model(model, config): "dtype":dtype, "group_size": config.group_size, # -1 (per-channel) "scheme": config.scheme, - "algorithm": "RTN", + "algorithm": config.algorithm, }, }, }, diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 8cf610f1015..18ce4ef6e87 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -16,19 +16,42 @@ # limitations under the License. -from .config import (WEIGHTS_NAME, AutoDistillationConfig, BenchmarkConfig, - DistillationConfig, DynamicLengthConfig, - FlashDistillationConfig, NASConfig, Provider, PrunerV2, - PruningConfig, QuantizationConfig, TFDistillationConfig) -from .distillation import (SUPPORTED_DISTILLATION_CRITERION_MODE, - DistillationCriterionMode) +from .config import ( + WEIGHTS_NAME, + AutoDistillationConfig, + BenchmarkConfig, + DistillationConfig, + DynamicLengthConfig, + FlashDistillationConfig, + NASConfig, + Provider, + PrunerV2, + PruningConfig, + QuantizationConfig, + TFDistillationConfig, +) +from .distillation import ( + SUPPORTED_DISTILLATION_CRITERION_MODE, + DistillationCriterionMode, +) from .mixture.auto_distillation import AutoDistillation from .nas import NAS from .optimizer import NoTrainerOptimizer, Orchestrate_optimizer from .optimizer_tf import TFOptimization from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode -from .utils import (AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, - WeightOnlyQuantizationConfig, metrics, objectives) +from .utils import ( + AMPConfig, + BitsAndBytesConfig, + SmoothQuantConfig, + WeightOnlyQuantConfig, + metrics, + objectives, +) from .utils.utility import LazyImport -from .modeling import AutoModelForCausalLM, OptimizedModel +from .modeling import ( + AutoModelForCausalLM, + AutoModel, + AutoModelForSeq2SeqLM, + OptimizedModel, +) diff --git a/intel_extension_for_transformers/transformers/modeling/__init__.py b/intel_extension_for_transformers/transformers/modeling/__init__.py index e8353a5ea25..5fb99065b03 100644 --- a/intel_extension_for_transformers/transformers/modeling/__init__.py +++ b/intel_extension_for_transformers/transformers/modeling/__init__.py @@ -19,4 +19,5 @@ from .model import OptimizedModel -from .modeling_auto import AutoModelForCausalLM +from .modeling_auto 
import (AutoModel, AutoModelForCausalLM, + AutoModelForSeq2SeqLM) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 92e23893449..517c91371fc 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -1,75 +1,252 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import torch -import transformers - - -logger = logging.getLogger(__name__) - - -class _BaseQBitsAutoModelClass: - ORIG_MODEL = None - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - import intel_extension_for_transformers.transformers.modeling.modeling_map - load_in_8bit = kwargs.pop("load_in_8bit", False) - load_in_4bit = kwargs.pop("load_in_4bit", False) - quantization_config = kwargs.pop("quantization_config", None) - if load_in_8bit or load_in_4bit or quantization_config is not None: - from ...llm.quantization.config import WeightOnlyConfig - from ...llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str - torch_dtype = kwargs.pop("torch_dtype", torch.float32) - if load_in_4bit: - if quantization_config is None: - quantization_config = WeightOnlyConfig(compute_dtype=torch_dtype, weight_dtype="nf4") - else: - assert "4" in quantization_config.weight_dtype and quantization_config.compute_dtype == torch_dtype, \ - f"Quantization_config.weight_dtype should be 'nf4', 'int4_fullrange', 'int4_clip'," - f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}." - elif load_in_8bit: - if quantization_config is None: - quantization_config = WeightOnlyConfig(compute_dtype=torch_dtype, weight_dtype="int8") - else: - assert quantization_config.weight_dtype == "int8" \ - and quantization_config.compute_dtype == torch_dtype, \ - f"Quantization_config.weight_dtype should be 'int8' and compute_dtype should be {torch_dtype}." 
- elif quantization_config is not None: - if quantization_config.compute_dtype != convert_dtype_2_str(torch_dtype): - logger.warning(f"Quantization_config.compute_dtype should be align with {torch_dtype}.") - - model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - if quantization_config is not None: - return convert_to_quantized_model(model, quantization_config) - else: - return model - - -class AutoModelForCausalLM(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModelForCausalLM - - -class AutoModel(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModel - - -class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass): - ORIG_MODEL = transformers.AutoModelForSeq2SeqLM - +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from transformers import AutoConfig, PretrainedConfig +from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from transformers.models.auto.modeling_auto import (MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + MODEL_MAPPING + ) + +from transformers.models.auto.auto_factory import _get_model_class +from intel_extension_for_transformers.transformers.utils.utility import ( + LazyImport, + generate_dummy_past_key_values, + get_example_inputs_for_trace +) + + +from intel_extension_for_transformers.transformers import ( + AMPConfig, + WeightOnlyQuantConfig, + SmoothQuantConfig +) +import logging +import warnings +logger = logging.getLogger(__name__) +torch = LazyImport("torch") + + +class _BaseAutoModelClass: + # Base class for auto models. 
+ _model_mapping = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + import intel_extension_for_transformers.transformers.modeling.modeling_map + config = kwargs.pop("config", None) + calib_func = kwargs.pop("calib_func", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "code_revision", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + quantization_config = kwargs_orig.get("quantization_config", None) + if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or + isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, WeightOnlyQuantConfig) + ): + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + if isinstance(quantization_config, AMPConfig): + config.torch_dtype=torch.bfloat16 + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs + ) + _ = hub_kwargs.pop("code_revision", None) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + model = model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + else: + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
+ ) + model.eval() + if isinstance(quantization_config, WeightOnlyQuantConfig): + logger.info("Applying Weight Only Quantization.") + from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model + convert_to_quantized_model(model, quantization_config) + elif isinstance(quantization_config, SmoothQuantConfig): + logger.info("Applying SmoothQuant.") + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Please install Intel Extension for PyTorch to accelerate the model inference." + ) + if quantization_config.tokenizer is None: + logger.error("Please provide the tokenizer or provide calib_func directly," + + " the following is how to get tokenizer. \n" + + " from transformer import AutoTokenizer \n" + + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" + ) + exit(0) + if calib_func is None: + from datasets import load_dataset + from torch.utils.data import DataLoader + calib_dataset = quantization_config.calib_dataset + calib_iters = quantization_config.calib_iters + calib_dataset = load_dataset(calib_dataset, split="train") + calib_dataset = calib_dataset.shuffle(seed=42) + + def tokenize_function(examples): + if 'prompt' in examples: + example = quantization_config.tokenizer(examples["prompt"]) + elif 'text' in examples: + example = quantization_config.tokenizer(examples["text"]) + elif 'code' in examples: + example = quantization_config.tokenizer(examples["code"]) + else: + logger.error("Please check dataset prompt identifier," + + " NeelNanda/pile-10k is default used calibration dataset.") + exit(0) + return example + + tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format(type="torch", columns=["input_ids"]) + + def collate_batch(batch): + input_ids_padded = [] + for text in batch: + input_ids = text["input_ids"] + input_ids = ( + input_ids[: 512] + if len(input_ids) > 512 + else input_ids + ) + input_ids_padded.append(input_ids) + return (torch.vstack(input_ids_padded)) + calib_dataloader = DataLoader( + tokenized_dataset, + batch_size=1, + shuffle=False, + collate_fn=collate_batch, + ) + def default_calib_func(model): + """ + This is the default calibration function, the dataset is NeelNanda/pile-10k, + the default calib_iters is 100. 
+ """ + + for i, (input_ids) in enumerate(calib_dataloader): + input_bs, input_len = input_ids.shape + past_key_values = generate_dummy_past_key_values(input_bs, model) + attention_mask = torch.ones(input_bs, input_len + 1) + attention_mask[:,0] = 0 + if i >= calib_iters: + break + model( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + ) + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} + example_inputs = get_example_inputs_for_trace(model) + from neural_compressor import PostTrainingQuantConfig, quantization + conf = PostTrainingQuantConfig( + backend="ipex", + excluded_precisions=quantization_config.excluded_precisions, + op_type_dict=quantization_config.op_type_dict, + recipes=recipes, + example_inputs=example_inputs, + ) + if calib_func is None: + logger.info("The default calibration funcation is used, " + + "the calibration dataset is NeelNanda/pile-10k," + + "batchsize is 1 and calibration iteration is 100.") + calib_func = default_calib_func + else: + calib_func = calib_func + model.config.torchscript = True + model = quantization.fit( + model, + conf, + calib_func=calib_func + ).model + return model + + +class AutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING + +class AutoModel(_BaseAutoModelClass): + _model_mapping = MODEL_MAPPING + +class AutoModelForSeq2SeqLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py b/intel_extension_for_transformers/transformers/modeling/modeling_causal.py deleted file mode 100644 index 92686191a28..00000000000 --- a/intel_extension_for_transformers/transformers/modeling/modeling_causal.py +++ /dev/null @@ -1,262 +0,0 @@ -# !/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# coding=utf-8 -# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -from transformers import AutoConfig, PretrainedConfig -from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING -from transformers.models.auto.auto_factory import _get_model_class -from intel_extension_for_transformers.transformers.utils.utility import ( - LazyImport, - generate_dummy_past_key_values, - get_example_inputs_for_trace -) - - -from intel_extension_for_transformers.transformers import ( - AMPConfig, - WeightOnlyQuantizationConfig, - SmoothQuantConfig -) -import logging -import warnings -logger = logging.getLogger(__name__) -torch = LazyImport("torch") - - -class _BaseAutoModelClass: - # Base class for auto models. - _model_mapping = None - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - import intel_extension_for_transformers.transformers.modeling.modeling_map - config = kwargs.pop("config", None) - calib_func = kwargs.pop("calib_func", None) - trust_remote_code = kwargs.pop("trust_remote_code", None) - kwargs["_from_auto"] = True - hub_kwargs_names = [ - "cache_dir", - "code_revision", - "force_download", - "local_files_only", - "proxies", - "resume_download", - "revision", - "subfolder", - "use_auth_token", - ] - hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} - - if not isinstance(config, PretrainedConfig): - kwargs_orig = copy.deepcopy(kwargs) - # ensure not to pollute the config object with torch_dtype="auto" - since it's - # meaningless in the context of the config object - torch.dtype values are acceptable - if kwargs.get("torch_dtype", None) == "auto": - _ = kwargs.pop("torch_dtype") - # to not overwrite the quantization_config if config has a quantization_config - - if kwargs.get("quantization_config", None) is not None: - _ = kwargs.pop("quantization_config") - - config, kwargs = AutoConfig.from_pretrained( - pretrained_model_name_or_path, - return_unused_kwargs=True, - trust_remote_code=trust_remote_code, - **hub_kwargs, - **kwargs, - ) - - # if torch_dtype=auto was passed here, ensure to pass it on - if kwargs_orig.get("torch_dtype", None) == "auto": - kwargs["torch_dtype"] = "auto" - quantization_config = kwargs_orig.get("quantization_config", None) - if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or - isinstance(quantization_config, AMPConfig) or - isinstance(quantization_config, WeightOnlyQuantizationConfig) - ): - kwargs["quantization_config"] = kwargs_orig["quantization_config"] - if isinstance(quantization_config, AMPConfig): - config.torch_dtype=torch.bfloat16 - - has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map - has_local_code = type(config) in cls._model_mapping.keys() - trust_remote_code = resolve_trust_remote_code( - trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code - ) - if has_remote_code and trust_remote_code: - class_ref = config.auto_map[cls.__name__] - model_class = get_class_from_dynamic_module( - class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs - ) - _ = hub_kwargs.pop("code_revision", None) - model = model_class.from_pretrained( - pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs - ) - elif type(config) in cls._model_mapping.keys(): - model_class = _get_model_class(config, cls._model_mapping) - model = model_class.from_pretrained( - 
pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs - ) - else: - raise ValueError( - f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" - f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." - ) - model.eval() - if isinstance(quantization_config, WeightOnlyQuantizationConfig): - logger.info("Applying Weight Only Quantization.") - from neural_compressor import PostTrainingQuantConfig, quantization - op_type_dict = { - '.*':{ # re.match - "weight": { - 'bits': quantization_config.bits, # 1-8 bits - 'group_size': quantization_config.group_size, # -1 (per-channel) - 'scheme': quantization_config.scheme, # sym/asym - 'algorithm': quantization_config.algorithm, # RTN/AWQ/TEQ - }, - }, - } - recipes = {"rtn_args": {"enable_full_range": quantization_config.enable_full_range}} - conf = PostTrainingQuantConfig( - approach='weight_only', - op_type_dict=op_type_dict, - recipes=recipes, - ) - model.config.torchscript = True - model = quantization.fit( - model, - conf, - ).model - elif isinstance(quantization_config, SmoothQuantConfig): - logger.info("Applying SmoothQuant.") - try: - import intel_extension_for_pytorch as ipex - except ImportError: - warnings.warn( - "Please install Intel Extension for PyTorch to accelerate the model inference." - ) - if quantization_config.tokenizer is None: - logger.error("Please provide the tokenizer or provide calib_func directly," + - " the following is how to get tokenizer. \n" + - " from transformer import AutoTokenizer \n" + - " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n" - ) - exit(0) - if calib_func is None: - from datasets import load_dataset - from torch.utils.data import DataLoader - calib_dataset = quantization_config.calib_dataset - calib_iters = quantization_config.calib_iters - calib_dataset = load_dataset(calib_dataset, split="train") - calib_dataset = calib_dataset.shuffle(seed=42) - - def tokenize_function(examples): - if 'prompt' in examples: - example = quantization_config.tokenizer(examples["prompt"]) - elif 'text' in examples: - example = quantization_config.tokenizer(examples["text"]) - elif 'code' in examples: - example = quantization_config.tokenizer(examples["code"]) - else: - logger.error("Please check dataset prompt identifier," + - " NeelNanda/pile-10k is default used calibration dataset.") - exit(0) - return example - - tokenized_dataset = calib_dataset.map(tokenize_function, batched=True) - tokenized_dataset.set_format(type="torch", columns=["input_ids"]) - - def collate_batch(batch): - input_ids_padded = [] - for text in batch: - input_ids = text["input_ids"] - input_ids = ( - input_ids[: 512] - if len(input_ids) > 512 - else input_ids - ) - input_ids_padded.append(input_ids) - return (torch.vstack(input_ids_padded)) - calib_dataloader = DataLoader( - tokenized_dataset, - batch_size=1, - shuffle=False, - collate_fn=collate_batch, - ) - def default_calib_func(model): - """ - This is the default calibration function, the dataset is NeelNanda/pile-10k, - the default calib_iters is 100. 
- """ - - for i, (input_ids) in enumerate(calib_dataloader): - input_bs, input_len = input_ids.shape - past_key_values = generate_dummy_past_key_values(input_bs, model) - attention_mask = torch.ones(input_bs, input_len + 1) - attention_mask[:,0] = 0 - if i >= calib_iters: - break - model( - input_ids=input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": quantization_config.alpha}} - example_inputs = get_example_inputs_for_trace(model) - from neural_compressor import PostTrainingQuantConfig, quantization - conf = PostTrainingQuantConfig( - backend="ipex", - excluded_precisions=quantization_config.excluded_precisions, - op_type_dict=quantization_config.op_type_dict, - recipes=recipes, - example_inputs=example_inputs, - ) - if calib_func is None: - logger.info("The default calibration funcation is used, " + - "the calibration dataset is NeelNanda/pile-10k," + - "batchsize is 1 and calibration iteration is 100.") - calib_func = default_calib_func - else: - calib_func = calib_func - model.config.torchscript = True - model = quantization.fit( - model, - conf, - calib_func=calib_func - ).model - return model - - -class AutoModelForCausalLM(_BaseAutoModelClass): - _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING \ No newline at end of file diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index 1b574a0d3bf..c4eae8a076a 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -21,5 +21,5 @@ AMPConfig, BitsAndBytesConfig, SmoothQuantConfig, - WeightOnlyQuantizationConfig, + WeightOnlyQuantConfig, ) diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 1c590609055..9653c1a0dbf 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -16,19 +16,204 @@ # limitations under the License. 
"""Configs for intel extension for transformers.""" +import copy +import json +import os from dataclasses import dataclass, field -from typing import Any, Optional - +from typing import Any, Optional, Dict, Union +from .utility import LazyImport from transformers import BitsAndBytesConfig +from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str +torch = LazyImport("torch") +class WeightOnlyQuantConfig: + def __init__( + self, + llm_int8_skip_modules=None, + compute_dtype=None, + weight_dtype="int4_fullrange", # int8 int4_clip, int4_fullrange fp4_e2m1_bnb fp4_e2m1 nf4 + scale_dtype="fp32", # Now only fp32 + mse_range=False, + use_double_quant=False, + double_quant_dtype="int8", # reserve for double quant + double_quant_scale_dtype="fp32", # reserve for double quant + group_size=None, + scheme="sym", + algorithm="RTN", + **kwargs, + ): + self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] + self.weight_dtype = weight_dtype + self.scale_dtype = scale_dtype + self.mse_range = mse_range + self.use_double_quant = use_double_quant + self.double_quant_dtype = double_quant_dtype + self.double_quant_scale_dtype = double_quant_scale_dtype + self.scheme = scheme + self.algorithm = algorithm -@dataclass -class WeightOnlyQuantizationConfig: - algorithm: str = "RTN" - bits: int = 8 - group_size: int = -1 - scheme: str = "sym" - enable_full_range: bool = True + if group_size is None: + self.group_size = 32 + else: + self.group_size = group_size + if compute_dtype is None: + self.compute_dtype = "fp32" + elif isinstance(compute_dtype, str): + self.compute_dtype = compute_dtype + elif isinstance(compute_dtype, torch.dtype): + self.compute_dtype = convert_dtype_2_str(compute_dtype) + else: + raise ValueError("bit4_compute_dtype must be a string or a torch.dtype") + + self.post_init() + + def post_init(self): + r""" + Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. + """ + + if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): + raise ValueError("llm_int8_skip_modules must be a list of strings") + + if self.compute_dtype is not None and self.compute_dtype not in ['fp32', 'bf16', 'int8']: + raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") + + if self.weight_dtype not in ['int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1']: + raise ValueError(f"weight_dtype must be a string in " + f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1'") + + if self.scale_dtype not in ["fp32"]: + raise ValueError("scale_dtype must be a string in 'fp32'") + + if not isinstance(self.mse_range, bool): + raise ValueError("mse_range must be a boolean") + + if not isinstance(self.use_double_quant, bool): + raise ValueError("use_double_quant must be a boolean") + + if self.use_double_quant and not isinstance(self.double_quant_dtype, str): + raise ValueError("double_quant_dtype must be a string") + + if self.use_double_quant and not isinstance(self.double_quant_scale_dtype, str): + raise ValueError("double_quant_scale_dtype must be a string") + + if not isinstance(self.group_size, int): + raise ValueError("group_size must be a int") + + if not isinstance(self.scheme, str): + raise ValueError("scheme must be a string") + + def quantization_method(self): + r""" + This method returns the quantization method used for the model. 
+ """ + if self.weight_dtype == 8: + return "s8" + elif self.weight_dtype == 4 and self.weight_dtype == "s4fullrange": + return "s4fullrange" + else: + raise ValueError("Only support int8 and int4 quantization now!") + + @classmethod + def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): + """ + Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. + return_unused_kwargs (`bool`): + Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in + `PreTrainedModel`. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. + """ + + config = cls(**config_dict) + + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return config, kwargs + else: + return config + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + config_dict = self.to_dict() + json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + writer.write(json_string) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + + output = copy.deepcopy(self.__dict__) + output["compute_dtype"] = str(output["compute_dtype"]).split(".")[1] + + return output + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_json_string(self, use_diff: bool = True) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = WeightOnlyConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict @dataclass diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 49031214936..140fbfc087a 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -291,7 +291,7 @@ def test_quantization_for_llm(self): tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) from intel_extension_for_transformers.transformers import ( AMPConfig, - WeightOnlyQuantizationConfig, + WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig @@ -301,16 +301,16 @@ def test_quantization_for_llm(self): dummy_input = fp32_model.dummy_inputs["input_ids"] # smooth-quant - sq_config = SmoothQuantConfig( - tokenizer=tokenizer, # either two of one, tokenizer or calib_func - calib_iters=5 - ) - q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - quantization_config=sq_config - ) - self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + #sq_config = SmoothQuantConfig( + # tokenizer=tokenizer, # either two of one, tokenizer or calib_func + # calib_iters=5 + # ) + #q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + # quantization_config=sq_config + # ) + #self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) # weight-only - woq_config = WeightOnlyQuantizationConfig() + woq_config = WeightOnlyQuantConfig() woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=woq_config ) From 22204ee4b7d8c7e93e08a4b414be6f5419e1a68e Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:37:40 -0700 Subject: [PATCH 09/19] fix mp name Signed-off-by: changwangss --- .../transformers/modeling/modeling_auto.py | 6 +++--- .../transformers/utils/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 2 +- tests/test_quantization.py | 20 +++++++++---------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py index 517c91371fc..d5ad2705a7e 100644 --- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py +++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py @@ -48,7 +48,7 @@ from intel_extension_for_transformers.transformers import ( - AMPConfig, + MixedPrecisionConfig, WeightOnlyQuantConfig, SmoothQuantConfig ) @@ -106,11 +106,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): kwargs["torch_dtype"] = "auto" quantization_config = kwargs_orig.get("quantization_config", None) if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or - isinstance(quantization_config, AMPConfig) or + isinstance(quantization_config, MixedPrecisionConfig) or isinstance(quantization_config, WeightOnlyQuantConfig) ): kwargs["quantization_config"] = kwargs_orig["quantization_config"] - if isinstance(quantization_config, AMPConfig): + if isinstance(quantization_config, MixedPrecisionConfig): config.torch_dtype=torch.bfloat16 has_remote_code = hasattr(config, "auto_map") and 
cls.__name__ in config.auto_map diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py index c4eae8a076a..725ec6924be 100644 --- a/intel_extension_for_transformers/transformers/utils/__init__.py +++ b/intel_extension_for_transformers/transformers/utils/__init__.py @@ -18,7 +18,7 @@ """Utils for optimization.""" from .quantization_config import ( - AMPConfig, + MixedPrecisionConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantConfig, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 9653c1a0dbf..87d3837cfa5 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -217,7 +217,7 @@ def to_diff_dict(self) -> Dict[str, Any]: @dataclass -class AMPConfig: +class MixedPrecisionConfig: dtype: str = "bfloat16" @dataclass diff --git a/tests/test_quantization.py b/tests/test_quantization.py index 140fbfc087a..c9d027ea8d2 100644 --- a/tests/test_quantization.py +++ b/tests/test_quantization.py @@ -290,7 +290,7 @@ def test_quantization_for_llm(self): model_name_or_path = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) from intel_extension_for_transformers.transformers import ( - AMPConfig, + MixedPrecisionConfig, WeightOnlyQuantConfig, SmoothQuantConfig, BitsAndBytesConfig @@ -301,14 +301,14 @@ def test_quantization_for_llm(self): dummy_input = fp32_model.dummy_inputs["input_ids"] # smooth-quant - #sq_config = SmoothQuantConfig( - # tokenizer=tokenizer, # either two of one, tokenizer or calib_func - # calib_iters=5 - # ) - #q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, - # quantization_config=sq_config - # ) - #self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) + sq_config = SmoothQuantConfig( + tokenizer=tokenizer, # either two of one, tokenizer or calib_func + calib_iters=5 + ) + q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + quantization_config=sq_config + ) + self.assertTrue(isinstance(q_model, torch.jit.ScriptModule)) # weight-only woq_config = WeightOnlyQuantConfig() woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, @@ -317,7 +317,7 @@ def test_quantization_for_llm(self): output = woq_model(dummy_input) self.assertTrue(float(output[0][0][0][0]), -7.139640808105469) # amp - amp_config = AMPConfig() + amp_config = MixedPrecisionConfig() amp_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=amp_config ) From caa470442baf3c1e219154a4c6ca2831b313c6ce Mon Sep 17 00:00:00 2001 From: changwangss Date: Sun, 17 Sep 2023 23:58:35 -0700 Subject: [PATCH 10/19] fix pylint Signed-off-by: changwangss --- intel_extension_for_transformers/transformers/__init__.py | 2 +- .../transformers/utils/quantization_config.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py index 18ce4ef6e87..1a13a84ffec 100644 --- a/intel_extension_for_transformers/transformers/__init__.py +++ b/intel_extension_for_transformers/transformers/__init__.py @@ -41,7 +41,7 @@ from .pruning import SUPPORTED_PRUNING_MODE, PrunerConfig, PruningMode from .quantization import SUPPORTED_QUANT_MODE, QuantizationMode from .utils import ( - 
AMPConfig, + MixedPrecisionConfig, BitsAndBytesConfig, SmoothQuantConfig, WeightOnlyQuantConfig, diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 87d3837cfa5..1d5edc3c658 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -117,7 +117,7 @@ def quantization_method(self): @classmethod def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): """ - Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. + Instantiates a [`WeightOnlyQuantConfig`] from a Python dictionary of parameters. Args: config_dict (`Dict[str, Any]`): @@ -129,7 +129,7 @@ def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): Additional parameters from which to initialize the configuration object. Returns: - [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. + [`WeightOnlyQuantConfig`]: The configuration object instantiated from those parameters. """ config = cls(**config_dict) @@ -181,7 +181,7 @@ def to_json_string(self, use_diff: bool = True) -> str: Args: use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` + If set to `True`, only the difference between the config instance and the default `WeightOnlyQuantConfig()` is serialized to JSON string. Returns: @@ -204,7 +204,7 @@ def to_diff_dict(self) -> Dict[str, Any]: config_dict = self.to_dict() # get the default config dict - default_config_dict = WeightOnlyConfig().to_dict() + default_config_dict = WeightOnlyQuantConfig().to_dict() serializable_config_dict = {} From 1c1834900bd070892ac3ab0b67fab05752f399a8 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 18 Sep 2023 00:18:19 -0700 Subject: [PATCH 11/19] fix import Signed-off-by: changwangss --- .../transformers/utils/quantization_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/intel_extension_for_transformers/transformers/utils/quantization_config.py b/intel_extension_for_transformers/transformers/utils/quantization_config.py index 1d5edc3c658..b5a528e5415 100644 --- a/intel_extension_for_transformers/transformers/utils/quantization_config.py +++ b/intel_extension_for_transformers/transformers/utils/quantization_config.py @@ -23,7 +23,6 @@ from typing import Any, Optional, Dict, Union from .utility import LazyImport from transformers import BitsAndBytesConfig -from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str torch = LazyImport("torch") class WeightOnlyQuantConfig: @@ -42,6 +41,7 @@ def __init__( algorithm="RTN", **kwargs, ): + from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_2_str self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] self.weight_dtype = weight_dtype self.scale_dtype = scale_dtype @@ -181,7 +181,8 @@ def to_json_string(self, use_diff: bool = True) -> str: Args: use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyQuantConfig()` + If set to `True`, only the difference between the config instance and the default + `WeightOnlyQuantConfig()` is serialized to JSON string. 
Returns: From 4ef0a790c5bb80e545af8c4639df02d84d36a488 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 18 Sep 2023 16:21:58 +0800 Subject: [PATCH 12/19] Fixed shape error for weight-only quantization op Signed-off-by: Cheng, Penghui --- .../llm/quantization/nn/modules.py | 9 +++++++-- tests/test_weight_only.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/llm/quantization/nn/modules.py index 3a3ddf0d283..278ab0bf144 100644 --- a/intel_extension_for_transformers/llm/quantization/nn/modules.py +++ b/intel_extension_for_transformers/llm/quantization/nn/modules.py @@ -18,6 +18,8 @@ import os import torch +from functools import reduce +from operator import mul from peft.tuners.lora import LoraLayer from ..autograd import matmul_4bit @@ -98,13 +100,16 @@ def forward(self, x: torch.Tensor): if getattr(self.weight, 'quant_state', None) is None: print('FP4 quantization state not initialized. Please call .quantize_weights().') - m = x.size()[0] + shape = list(x.size()) + m = reduce(mul, shape[0:-1]) out = torch.zeros(m, self.out_features, dtype=x.dtype) bias = None if self.bias is None else self.bias.data torch.ops.weight_only_jblasop.qbits_linear( - x, self.weight.data, bias, out, + x.view(m, shape[-1]), self.weight.data, bias, out, self.out_features, self.bias is not None, self.compute_dtype, self.weight_dtype ) + shape[-1] = self.out_features + out = out.view(shape) return out diff --git a/tests/test_weight_only.py b/tests/test_weight_only.py index 291a18672c5..dd9c1af54b0 100644 --- a/tests/test_weight_only.py +++ b/tests/test_weight_only.py @@ -51,7 +51,7 @@ def test_int4(self): model = M(with_bias=bias) with torch.no_grad(): model.linear.weight = torch.nn.Parameter(revert_wei) - activation = torch.rand(1, 32, dtype=torch.float) + activation = torch.rand(1, 5, 32, dtype=torch.float) output = model(activation) with torch.no_grad(): model.linear.weight = torch.nn.Parameter(raw_wei) From 34d2354e3dab2009369863c98a295164cc3016d2 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Mon, 18 Sep 2023 16:56:00 +0800 Subject: [PATCH 13/19] Fixed UT error for weight-only quantization Signed-off-by: Cheng, Penghui --- .../llm/quantization/config/__init__.py | 3 - .../config/quantization_config.py | 212 ------------------ tests/test_weight_only.py | 10 +- 3 files changed, 5 insertions(+), 220 deletions(-) delete mode 100644 intel_extension_for_transformers/llm/quantization/config/quantization_config.py diff --git a/intel_extension_for_transformers/llm/quantization/config/__init__.py b/intel_extension_for_transformers/llm/quantization/config/__init__.py index 1df7ead0cec..18896e7b549 100644 --- a/intel_extension_for_transformers/llm/quantization/config/__init__.py +++ b/intel_extension_for_transformers/llm/quantization/config/__init__.py @@ -14,6 +14,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - -from .quantization_config import WeightOnlyConfig \ No newline at end of file diff --git a/intel_extension_for_transformers/llm/quantization/config/quantization_config.py b/intel_extension_for_transformers/llm/quantization/config/quantization_config.py deleted file mode 100644 index 6845774653d..00000000000 --- a/intel_extension_for_transformers/llm/quantization/config/quantization_config.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import json -import os -import torch -from ..utils import convert_dtype_2_str -from typing import Any, Dict, Union - - -class WeightOnlyConfig: - def __init__( - self, - llm_int8_skip_modules=None, - compute_dtype=None, - weight_dtype="int4_fullrange", # int8 int4_clip, int4_fullrange fp4_e2m1_bnb fp4_e2m1 nf4 - scale_dtype="fp32", # Now only fp32 - mse_range=False, - use_double_quant=False, - double_quant_dtype="int8", # reserve for double quant - double_quant_scale_dtype="fp32", # reserve for double quant - group_size=None, - scheme="sym", - **kwargs, - ): - self.llm_int8_skip_modules = llm_int8_skip_modules if llm_int8_skip_modules else [] - self.weight_dtype = weight_dtype - self.scale_dtype = scale_dtype - self.mse_range = mse_range - self.use_double_quant = use_double_quant - self.double_quant_dtype = double_quant_dtype - self.double_quant_scale_dtype = double_quant_scale_dtype - self.scheme = scheme - - if group_size is None: - self.group_size = 32 - else: - self.group_size = group_size - if compute_dtype is None: - self.compute_dtype = "fp32" - elif isinstance(compute_dtype, str): - self.compute_dtype = compute_dtype - elif isinstance(compute_dtype, torch.dtype): - self.compute_dtype = convert_dtype_2_str(compute_dtype) - else: - raise ValueError("bit4_compute_dtype must be a string or a torch.dtype") - - self.post_init() - - def post_init(self): - r""" - Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
- """ - - if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") - - if self.compute_dtype is not None and self.compute_dtype not in ['fp32', 'bf16', 'int8']: - raise ValueError("compute_dtype must be 'fp32', 'bf16', 'int8'.") - - if self.weight_dtype not in ['int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1']: - raise ValueError(f"weight_dtype must be a string in " - f"'int8', 'int4_fullrange', 'int4_clip', 'nf4', 'fp4_e2m1_bnb', 'fp4_e2m1'") - - if self.scale_dtype not in ["fp32"]: - raise ValueError("scale_dtype must be a string in 'fp32'") - - if not isinstance(self.mse_range, bool): - raise ValueError("mse_range must be a boolean") - - if not isinstance(self.use_double_quant, bool): - raise ValueError("use_double_quant must be a boolean") - - if self.use_double_quant and not isinstance(self.double_quant_dtype, str): - raise ValueError("double_quant_dtype must be a string") - - if self.use_double_quant and not isinstance(self.double_quant_scale_dtype, str): - raise ValueError("double_quant_scale_dtype must be a string") - - if not isinstance(self.group_size, int): - raise ValueError("group_size must be a int") - - if not isinstance(self.scheme, str): - raise ValueError("scheme must be a string") - - def quantization_method(self): - r""" - This method returns the quantization method used for the model. - """ - if self.weight_dtype == 8: - return "s8" - elif self.weight_dtype == 4 and self.weight_dtype == "s4fullrange": - return "s4fullrange" - else: - raise ValueError("Only support int8 and int4 quantization now!") - - @classmethod - def from_dict(cls, config_dict, return_unused_kwargs, **kwargs): - """ - Instantiates a [`WeightOnlyConfig`] from a Python dictionary of parameters. - - Args: - config_dict (`Dict[str, Any]`): - Dictionary that will be used to instantiate the configuration object. - return_unused_kwargs (`bool`): - Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in - `PreTrainedModel`. - kwargs (`Dict[str, Any]`): - Additional parameters from which to initialize the configuration object. - - Returns: - [`WeightOnlyConfig`]: The configuration object instantiated from those parameters. - """ - - config = cls(**config_dict) - - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - if return_unused_kwargs: - return config, kwargs - else: - return config - - def to_json_file(self, json_file_path: Union[str, os.PathLike]): - """ - Save this instance to a JSON file. - - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this configuration instance's parameters will be saved. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - config_dict = self.to_dict() - json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - writer.write(json_string) - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes this instance to a Python dictionary. Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
- """ - - output = copy.deepcopy(self.__dict__) - output["compute_dtype"] = str(output["compute_dtype"]).split(".")[1] - - return output - - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - - def to_json_string(self, use_diff: bool = True) -> str: - """ - Serializes this instance to a JSON string. - - Args: - use_diff (`bool`, *optional*, defaults to `True`): - If set to `True`, only the difference between the config instance and the default `WeightOnlyConfig()` - is serialized to JSON string. - - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict() - else: - config_dict = self.to_dict() - return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - def to_diff_dict(self) -> Dict[str, Any]: - """ - Removes all attributes from config which correspond to the default config attributes for better readability and - serializes to a Python dictionary. - - Returns: - `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() - - # get the default config dict - default_config_dict = WeightOnlyConfig().to_dict() - - serializable_config_dict = {} - - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if value != default_config_dict[key]: - serializable_config_dict[key] = value - - return serializable_config_dict \ No newline at end of file diff --git a/tests/test_weight_only.py b/tests/test_weight_only.py index dd9c1af54b0..90d2ba54b19 100644 --- a/tests/test_weight_only.py +++ b/tests/test_weight_only.py @@ -4,7 +4,7 @@ from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM from intel_extension_for_transformers.llm.quantization.nn.modules import QuantizedLinearQBits from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, replace_linear -from intel_extension_for_transformers.llm.quantization.config import WeightOnlyConfig +from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig class M(torch.nn.Module): @@ -33,7 +33,7 @@ def test_int8(self): activation = torch.rand(1,32, dtype=torch.float) output = model(activation) - config = WeightOnlyConfig(weight_dtype="int8", group_size=32) + config = WeightOnlyQuantConfig(weight_dtype="int8", group_size=32) convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -56,7 +56,7 @@ def test_int4(self): with torch.no_grad(): model.linear.weight = torch.nn.Parameter(raw_wei) - config = WeightOnlyConfig(weight_dtype="int4_fullrange", group_size=32) + config = WeightOnlyQuantConfig(weight_dtype="int4_fullrange", group_size=32) convert_to_quantized_model(model, config) output_quant = model(activation) print(output) @@ -80,7 +80,7 @@ def test_int4(self): # return x # model = LinearPredictor() - # replace_linear(model, None, None, WeightOnlyConfig(weight_dtype='int4_fullrange')) + # replace_linear(model, None, None, WeightOnlyQuantConfig(weight_dtype='int4_fullrange')) # lossfn = torch.nn.MSELoss() # optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=1e-3) # batch_size = 16 @@ -108,7 +108,7 @@ def test_auto_model(self): self.assertTrue(len(module_list) > 0) def test_auto_model_with_config(self): - config = WeightOnlyConfig() + config = WeightOnlyQuantConfig() model = AutoModelForCausalLM.from_pretrained(llama_model_path, 
quantization_config=config) module_list = [] for name, module in model.named_modules(): From 14df0f3533a8dfc2f315a8a8eae09ace9fb1e0f0 Mon Sep 17 00:00:00 2001 From: changwangss Date: Mon, 18 Sep 2023 04:01:48 -0700 Subject: [PATCH 14/19] improve the example Signed-off-by: changwangss --- .../text-generation/quantization/README.md | 155 +++++++++--------- .../text-generation/quantization/build_env.sh | 118 +++++++++++++ .../quantization/requirements.txt | 1 + .../quantization/run_generation.py | 143 ++++++++-------- .../transformers/modeling/modeling_auto.py | 128 +++++---------- 5 files changed, 308 insertions(+), 237 deletions(-) create mode 100644 examples/huggingface/pytorch/text-generation/quantization/build_env.sh diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md index 58a25f7fd2b..204f31b69cc 100644 --- a/examples/huggingface/pytorch/text-generation/quantization/README.md +++ b/examples/huggingface/pytorch/text-generation/quantization/README.md @@ -6,106 +6,101 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI # Prerequisite​ ## 1. Create Environment​ -If you want to use Pytorch & Intel-extension-for-pytorch version 2.0.1, please -``` -pip install -r requirements.txt -``` -If you want to use Pytorch & Intel-extension-for-pytorch version 2.1, the dependent packages are listed in requirements, we recommend create environment as the following steps. +Pytorch & Intel-extension-for-pytorch version 2.1 is required, the dependent packages are listed in requirements, we recommend create environment as the following steps. ```bash -WORK_DIR=$PWD -# GCC 12.3 is required, please set it firstly -# Create environment (conda recommended) -conda create -n llm python=3.9 -y -# install deps, please try gcc, gxx 12.2 if 12.3 doesn't find from conda -conda install gcc=12.3 gxx=12.3 cxx-compiler -c conda-forge -y -conda install cmake ninja mkl mkl-include -y -conda install gperftools -c conda-forge -y - -# Install PyTorch -python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl - -# Install IPEX with semi-compiler, require gcc 12.3 or 12.2 -rm -rf llvm-project && mkdir llvm-project && cd llvm-project -wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/cmake-16.0.6.src.tar.xz -wget https://github.com/llvm/llvm-project/releases/download/llvmorg-16.0.6/llvm-16.0.6.src.tar.xz -tar -xf cmake-16.0.6.src.tar.xz && mv cmake-16.0.6.src cmake -tar -xf llvm-16.0.6.src.tar.xz && mv llvm-16.0.6.src llvm -mkdir build && cd build -cmake ../llvm -DCMAKE_INSTALL_PREFIX=${PWD}/_install/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -make install -j$(nproc) -ln -s ${PWD}/_install/llvm/bin/llvm-config ${CONDA_PREFIX}/bin/llvm-config-13 -cd ../../ - -git clone --branch llm_feature_branch https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git submodule sync && git submodule update --init --recursive -export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1 -export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS" -python setup.py install -cd ../ - -# disable semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models, other models don't need it. 
-export _DNNL_DISABLE_COMPILER_BACKEND=1
-
-# Install neural-compressor
-git clone https://github.com/intel/neural-compressor.git
-cd neural-compressor
+conda create -n llm python=3.9 -y
+conda activate llm
+bash build_env.sh
+git clone https://github.com/intel/intel-extension-for-transformers.git
+cd intel-extension-for-transformers
 pip install -r requirements.txt
 python setup.py install
+```
+> Note:
+> Disable the semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models; other
+> models don't need it.
+> `export _DNNL_DISABLE_COMPILER_BACKEND=1`
 
-# Install lm_eval
-pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
-# Install others deps
-pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3
-````
-We use the GPTJ defination script [modeling_gptj.py](https://github.com/intel/intel-extension-for-transformers/blob/main/intel_extension_for_transformers/transformers/modeling/gptj/modeling_gptj.py) in `run_generation.py`. Here is a little change to success trace.
-```diff
-# Line 602 in modeling_gptj.py on transformers 4.28.1
+> Note: if an `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error is raised when importing intel-extension-for-pytorch, the system libstdc++ is older than the required GCC runtime; locate and preload the conda-provided library as follows.
+> ```bash
+> find $CONDA_PREFIX | grep libstdc++.so.6
+> export LD_PRELOAD=<path to libstdc++.so.6>:${LD_PRELOAD}
+> ```
 
-- position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
-+ position_ids = torch.arange(past_length, torch.tensor(input_shape[-1]) + torch.tensor(past_length), dtype=torch.long, device=device)
-```
-The changes for `llama` series models in `modeling_llama.py`, `dolly_v2_3b` series models in `modeling_gpt_neox.py`, `bloom` series models in `modeling_bloom.py` and `opt` series models in `modeling_opt.py` are similar to the above.
 # Run
+We support compression technologies such as `MixedPrecision`, `SmoothQuant` and `WeightOnlyQuant` with the `RTN/AWQ/TEQ/GPTQ` algorithms; `BitsAndBytes`-based quantization also works. The following commands show how to use them.
 
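+The commands below drive the Python API of `intel_extension_for_transformers`. As a rough, illustrative sketch (the model name and config values are placeholders, not tuned recommendations), the underlying calls look roughly like this:
+```python
+from transformers import AutoTokenizer
+from intel_extension_for_transformers.transformers import (
+    AutoModelForCausalLM,
+    MixedPrecisionConfig,
+    SmoothQuantConfig,
+    WeightOnlyQuantConfig,
+)
+
+model_name = "EleutherAI/gpt-j-6b"  # placeholder; any supported causal LM works
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# mixed precision (bf16 by default)
+bf16_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=MixedPrecisionConfig()
+)
+
+# weight-only quantization (RTN algorithm by default)
+woq_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=WeightOnlyQuantConfig(weight_dtype="int4_fullrange")
+)
+
+# smooth quantization; the tokenizer is used to build the default calibration dataloader
+sq_model = AutoModelForCausalLM.from_pretrained(
+    model_name, quantization_config=SmoothQuantConfig(tokenizer=tokenizer, calib_iters=100)
+)
+```
+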
-## 1. Quantization
+## 1. Performance
 ``` bash
-# --int8 is used for int8 only.
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
-python run_generation.py \
-    --model EleutherAI/gpt-j-6b \
-    --quantize \
-    --sq \
-    --alpha 1.0 \
-    --int8_bf16_mixed \
-    --ipex
-```
-## 2. Performance
-```bash
-# --int8 is used for int8 only.
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 export KMP_BLOCKTIME=1
 export KMP_SETTINGS=1
 export KMP_AFFINITY=granularity=fine,compact,1,0
 export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
+# fp32
 OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
     --model EleutherAI/gpt-j-6b \
-    --benchmark \
-    --int8_bf16_mixed \
-    --ipex
+    --benchmark
+# mixedprecision
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --mixed_precision \
+    --benchmark
+# smoothquant
+# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --sq \
+    --alpha 1.0 \
+    --int8 \
+    --benchmark
+# weightonlyquant
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --woq \
+    --benchmark
+# bitsandbytes
+OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <cpu list> python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --bitsandbytes \
+    --benchmark
+
 ```
-## 3. Accuracy
+
+## 2. Accuracy
 ```bash
-# --int8 is used for int8 only.
+# fp32
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --accuracy \
+    --tasks "lambada_openai"
+# mixedprecision
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --mixed_precision \
+    --accuracy \
+    --tasks "lambada_openai"
+# smoothquant
 # --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 python run_generation.py \
-    --model EleutherAI/gpt-j-6b \
-    --accuracy \
-    --int8_bf16_mixed \
-    --ipex \
-    --tasks "lambada_openai"
+    --model EleutherAI/gpt-j-6b \
+    --sq \
+    --alpha 1.0 \
+    --int8 \
+    --accuracy \
+    --tasks "lambada_openai"
+# weightonlyquant
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --woq \
+    --accuracy \
+    --tasks "lambada_openai"
+# bitsandbytes
+python run_generation.py \
+    --model EleutherAI/gpt-j-6b \
+    --bitsandbytes \
+    --accuracy \
+    --tasks "lambada_openai"
 ```
diff --git a/examples/huggingface/pytorch/text-generation/quantization/build_env.sh b/examples/huggingface/pytorch/text-generation/quantization/build_env.sh
new file mode 100644
index 00000000000..2e81bd8e8d3
--- /dev/null
+++ b/examples/huggingface/pytorch/text-generation/quantization/build_env.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+set -x
+set -e
+
+VER_LLVM="llvmorg-16.0.6"
+VER_IPEX="7256d0848ba81bb802dd33fca0e33049a751db58"
+
+# Check existence of required Linux commands
+for CMD in conda git nproc make; do
+    command -v ${CMD} || (echo "Error: Command \"${CMD}\" not found." ; exit 4)
+done
+
+MAX_JOBS_VAR=$(nproc)
+if [ ! -z "${MAX_JOBS}" ]; then
+    MAX_JOBS_VAR=${MAX_JOBS}
+fi
+
+# Save current directory path
+BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd ${BASEFOLDER}
+# Checkout individual components
+if [ ! -d llvm-project ]; then
+    git clone https://github.com/llvm/llvm-project.git
+fi
+if [ ! -d intel-extension-for-pytorch ]; then
+    git clone https://github.com/intel/intel-extension-for-pytorch.git
+fi
+
+# Checkout required branch/commit and update submodules
+cd llvm-project
+if [ ! -z ${VER_LLVM} ]; then
+    git checkout ${VER_LLVM}
+fi
+git submodule sync
+git submodule update --init --recursive
+cd ..
+cd intel-extension-for-pytorch
+if [ ! -z ${VER_IPEX} ]; then
+    git checkout ${VER_IPEX}
+fi
+git submodule sync
+git submodule update --init --recursive
+cd ..
+
+# Install dependencies
+conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge
+conda update -y sysroot_linux-64
+python -m pip install cmake
+python -m pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.1.0.dev20230711%2Bcpu-cp39-cp39-linux_x86_64.whl
+ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
+
+# Compile individual component
+export CC=${CONDA_PREFIX}/bin/gcc
+export CXX=${CONDA_PREFIX}/bin/g++
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
+
+# LLVM
+cd llvm-project
+LLVM_ROOT="$(pwd)/release"
+if [ -d ${LLVM_ROOT} ]; then
+    rm -rf ${LLVM_ROOT}
+fi
+if [ -d build ]; then
+    rm -rf build
+fi
+mkdir build
+cd build
+echo "***************************** cmake *****************************" > ../build.log
+cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=${ABI}" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF ../llvm 2>&1 | tee -a ../build.log
+echo "***************************** build *****************************" >> ../build.log
+cmake --build . -j ${MAX_JOBS_VAR} 2>&1 | tee -a ../build.log
+echo "**************************** install ****************************" >> ../build.log
+cmake -DCMAKE_INSTALL_PREFIX=${LLVM_ROOT} -P cmake_install.cmake 2>&1 | tee -a ../build.log
+#xargs rm -rf < install_manifest.txt
+cd ..
+rm -rf build
+ln -s ${LLVM_ROOT}/bin/llvm-config ${LLVM_ROOT}/bin/llvm-config-13
+export PATH=${LLVM_ROOT}/bin:$PATH
+export LD_LIBRARY_PATH=${LLVM_ROOT}/lib:$LD_LIBRARY_PATH
+cd ..
+# Intel® Extension for PyTorch*
+cd intel-extension-for-pytorch
+python -m pip install -r requirements.txt
+export LLVM_DIR=${LLVM_ROOT}/lib/cmake/llvm
+export DNNL_GRAPH_BUILD_COMPILER_BACKEND=1
+CXXFLAGS_BK=${CXXFLAGS}
+export CXXFLAGS="${CXXFLAGS} -D__STDC_FORMAT_MACROS"
+python setup.py clean
+python setup.py bdist_wheel 2>&1 | tee build.log
+export CXXFLAGS=${CXXFLAGS_BK}
+unset DNNL_GRAPH_BUILD_COMPILER_BACKEND
+unset LLVM_DIR
+python -m pip install --force-reinstall dist/*.whl
+cd ..
+
+# Sanity Test
+set +x
+export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so
+echo "Note: Should you experience \"version \`GLIBCXX_N.N.NN' not found\" error, run command \"export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so\" and try again."
+python -c "import torch; import intel_extension_for_pytorch as ipex; print(f'torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}'); print(f'torch_version: {torch.__version__}'); print(f'ipex_version: {ipex.__version__}');"
+# Install neural-compressor
+git clone https://github.com/intel/neural-compressor.git
+cd neural-compressor
+pip install -r requirements.txt
+python setup.py install
+cd ..
+
+# Install intel-extension-for-pytorch
+git checkout -b int8_llama2
+pip install -r requirements.txt
+python setup.py install
+cd ..
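As a hedged aside for readers following the build script above (not part of build_env.sh or this patch series): once the script and the README install steps complete, a quick import check along these lines can confirm the stack is usable. All class names come from the hunks in this series; nothing else is assumed.

```python
# Editorial sanity-check sketch; assumes build_env.sh and the README install steps finished.
import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    MixedPrecisionConfig,
    SmoothQuantConfig,
    WeightOnlyQuantConfig,
)

print(f"torch_version: {torch.__version__}")
print(f"ipex_version: {ipex.__version__}")
print(f"torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}")
```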
+
+# Install lm_eval
+pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
+# Install other deps
+pip install transformers optimum-intel cpuid accelerate datasets sentencepiece protobuf==3.20.3
+
diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
index bd067d4e1e6..4858296fa24 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements.txt
@@ -1,5 +1,6 @@
 accelerate
 datasets >= 2.0
+peft
 protobuf
 sentencepiece != 0.1.92
 torch >= 1.10.0
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
index b0529869455..02f5600d712 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
@@ -2,22 +2,18 @@
 import re
 import time
 import json
-import os
-import pathlib
 import torch
-import types
-from pathlib import Path
-from datasets import load_dataset, load_from_disk
-from torch.nn.functional import pad
-from torch.utils.data import DataLoader
-from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
+from transformers import AutoConfig, AutoTokenizer
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 from transformers.utils import check_min_version
-import transformers
-import numpy as np
-from itertools import chain
-from optimum.utils import NormalizedConfigManager
 from optimum.intel.generation.modeling import TSModelForCausalLM
+from intel_extension_for_transformers.transformers import (
+    MixedPrecisionConfig,
+    WeightOnlyQuantConfig,
+    SmoothQuantConfig,
+    BitsAndBytesConfig
+)
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -28,53 +24,74 @@ parser.add_argument(
     "--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k"
 )
-parser.add_argument("--dtype", type=str, default="int8")
 parser.add_argument(
     "--max-new-tokens", default=32, type=int, help="output max new tokens"
 )
 parser.add_argument("--output_dir", nargs="?", default="./saved_results")
-parser.add_argument("--quantize", action="store_true")
-parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.")
-parser.add_argument(
-    "--pad_max_length", default=512, type=int, help="Pad input ids to max length."
-)
 parser.add_argument("--int8", action="store_true")
 parser.add_argument(
     "--int8_bf16_mixed",
     action="store_true",
     help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)",
 )
+parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
+# ============Benchmark configs==============
 parser.add_argument("--benchmark", action="store_true")
 parser.add_argument("--iters", default=100, type=int, help="num iter")
 parser.add_argument("--num_warmup", default=10, type=int, help="num warmup")
+# ============Accuracy configs==============
 parser.add_argument("--accuracy", action="store_true")
-parser.add_argument("--batch_size", default=1, type=int,
+parser.add_argument("--batch_size", default=56, type=int,
                     help="batch size num.")
 parser.add_argument("--save_accuracy_path", default=None,
                     help="Save accuracy results path.")
-parser.add_argument("--tasks", nargs='+', default=["winogrande", "copa", "piqa", "rte", "hellaswag", \
-                    "openbookqa", "lambada_openai", "lambada_standard", "wikitext"], type=str, \
+parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], type=str, \
                     help="tasks list for accuracy validation")
-
+# ============MixedPrecision configs==============
+parser.add_argument("--mixed_precision", action="store_true")
+# ============SmoothQuant configs==============
+parser.add_argument("--sq", action="store_true")
+parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.")
+# ============WeightOnlyQuant configs===============
+parser.add_argument("--woq", action="store_true")
+parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
+                    help="Weight-only parameter.")
+parser.add_argument("--woq_dtype", type=str, default="int8",
+                    choices=["int8", "int4_clip", "int4_fullrange", "fp4_e2m1_bnb", "fp4_e2m1", "nf4"])
+parser.add_argument("--woq_group_size", type=int, default=-1)
+parser.add_argument("--woq_scheme", default="sym")
+parser.add_argument("--woq_enable_mse_search", action="store_true")
+parser.add_argument("--woq_enable_full_range", action="store_true")
+# =============WeightOnlyQuant GPTQ configs====================
+
+parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.")
+parser.add_argument('--gptq_percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
+parser.add_argument('--gptq_block_size', type=int, default=128, help='Block size. sub weight matrix size to run GPTQ.')
+parser.add_argument('--gptq_nsamples', type=int, default=128, help='Number of calibration data samples.')
+parser.add_argument('--gptq_use_max_length', action="store_true", help='Set all sequence length to be same length of args.gptq_pad_max_length')
+parser.add_argument('--gptq_pad_max_length', type=int, default=2048, help='Calibration dataset sequence max length, \
+                    this should align with your model config, \
+                    and your dataset builder args: args.pad_max_length')
+# ============BitsAndBytes configs==============
+parser.add_argument("--bitsandbytes", action="store_true")
+# =======================================
 args = parser.parse_args()
-calib_size = 1
+# transformers version >= 4.32.0 contained the mpt modeling definition.
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py
+check_min_version("4.32.0")
 
-# model
+# get model config
 config = AutoConfig.from_pretrained(
     args.model,
     torchscript=True
-    if args.quantize
+    if (args.sq or args.woq_algo in ['AWQ', 'TEQ'])
     else False,  # torchscript will force `return_dict=False` to avoid jit errors
     use_cache=True, # to use kv cache.
     trust_remote_code=args.trust_remote_code,
     revision=args.revision,
 )
-# transformers version >= 4.32.0 contained the mpt modeling definition.
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/mpt/modeling_mpt.py
-if config.model_type == "mpt":
-    check_min_version("4.32.0")
 
 # tokenizer
 if config.model_type == "llama":
@@ -83,15 +100,14 @@
 else:
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
-# quantize
-if args.quantize:
-    from intel_extension_for_transformers.transformers import (
-        AMPConfig,
-        WeightOnlyQuantizationConfig,
-        SmoothQuantConfig,
-        BitsAndBytesConfig
-
-    )
+# mixedprecision
+if args.mixed_precision:
+    mp_config = MixedPrecisionConfig(dtype="bfloat16") # default is bfloat16
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
+                                                      quantization_config=mp_config
+                                                      )
+# smoothquant
+elif args.sq:
     from intel_extension_for_transformers.transformers import AutoModelForCausalLM
     if re.search("gptj", config.model_type) or re.search(
         "gpt_neox", config.model_type
@@ -113,49 +129,40 @@ op_type_dict=op_type_dict, # default is {}
         excluded_precisions=excluded_precisions,  # default is []
     )
-    # smooth-quant
-    q_model = AutoModelForCausalLM.from_pretrained(args.model,
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                     quantization_config=sq_config
                                                     )
-    print("sq done.")
-    # weight-only
-    woq_config = WeightOnlyQuantizationConfig(algorithm="RTN", # default is "RTN"
-                                              bits=8, # default is 8
-                                              group_size=-1, # default is -1
-                                              scheme="sym", # default is sym
-                                              enable_full_range=True # default is True
-                                              )
-    woq_model = AutoModelForCausalLM.from_pretrained(args.model,
+    config.save_pretrained(args.output_dir)
+    user_model.save(args.output_dir)
+# weight-only
+elif args.woq:
+    woq_config = WeightOnlyQuantConfig()
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                      quantization_config=woq_config
                                                      )
-    print("woq done.")
-    # amp
-    amp_config = AMPConfig(dtype="bfloat16") # default is bfloat16
-    amp_model = AutoModelForCausalLM.from_pretrained(args.model,
-                                                     quantization_config=amp_config
-                                                     )
-    print("amp done.")
-    # bitsandbytes
+# bitsandbytes
+elif args.bitsandbytes:
     bab_config = BitsAndBytesConfig()
-    bab_model = AutoModelForCausalLM.from_pretrained(args.model,
+    user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                      quantization_config=bab_config
                                                      )
-    print("bitsandbytes done.")
-
+elif not args.int8 or args.int8_bf16_mixed:
+    user_model = AutoModelForCausalLM.from_pretrained(args.model, config=config)
+    # peft
+    if args.peft_model_id is not None:
+        from peft import PeftModel
+        user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
 
 # Generation
 generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=4)
+
 if args.int8 or args.int8_bf16_mixed:
     # TorchScript model don't attribute generate method, the wrapper is provided.
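A hedged usage sketch of the unified `quantization_config` entry point that run_generation.py drives above: the classes and the keyword come from this patch, while the model id, prompt, and generation settings are illustrative only, and actual behavior depends on the quantization utilities the patch wires in.

```python
# Minimal sketch, not an authoritative reference for this patch.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    MixedPrecisionConfig,
    WeightOnlyQuantConfig,
)

model_name = "EleutherAI/gpt-j-6b"  # example model used throughout the README
tokenizer = AutoTokenizer.from_pretrained(model_name)

# bf16 mixed precision, mirroring the --mixed_precision branch above
bf16_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=MixedPrecisionConfig(dtype="bfloat16")
)

# default RTN weight-only quantization, mirroring the --woq branch above
woq_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=WeightOnlyQuantConfig()
)

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
output = woq_model.generate(input_ids, do_sample=False, num_beams=4, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```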
-    if args.ipex:
-        user_model = TSModelForCausalLM.from_pretrained(
-            args.output_dir, file_name="best_model.pt", trust_remote_code=args.trust_remote_code
-        )
-    else:
-        from neural_compressor.utils.pytorch import load
-
-        user_model = load(args.output_dir, user_model)
+    import intel_extension_for_pytorch as ipex
+    user_model = TSModelForCausalLM.from_pretrained(
+        args.output_dir, file_name="best_model.pt", trust_remote_code=args.trust_remote_code
+    )
 
 if args.benchmark:
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index d5ad2705a7e..8000c6dcaf0 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -30,16 +30,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-from transformers import AutoConfig, PretrainedConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
-from transformers.models.auto.modeling_auto import (MODEL_FOR_CAUSAL_LM_MAPPING,
-                                                    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
-                                                    MODEL_MAPPING
-                                                    )
-from transformers.models.auto.auto_factory import _get_model_class
+import logging
+import torch
+import transformers
+
 from intel_extension_for_transformers.transformers.utils.utility import (
     LazyImport,
     generate_dummy_past_key_values,
@@ -57,86 +53,38 @@ logger = logging.getLogger(__name__)
 
 torch = LazyImport("torch")
-
-class _BaseAutoModelClass:
-    # Base class for auto models.
-    _model_mapping = None
+class _BaseQBitsAutoModelClass:
+    ORIG_MODEL = None
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         import intel_extension_for_transformers.transformers.modeling.modeling_map
 
-        config = kwargs.pop("config", None)
+        load_in_8bit = kwargs.pop("load_in_8bit", False)
+        load_in_4bit = kwargs.pop("load_in_4bit", False)
         calib_func = kwargs.pop("calib_func", None)
-        trust_remote_code = kwargs.pop("trust_remote_code", None)
-        kwargs["_from_auto"] = True
-        hub_kwargs_names = [
-            "cache_dir",
-            "code_revision",
-            "force_download",
-            "local_files_only",
-            "proxies",
-            "resume_download",
-            "revision",
-            "subfolder",
-            "use_auth_token",
-        ]
-        hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}
-
-        if not isinstance(config, PretrainedConfig):
-            kwargs_orig = copy.deepcopy(kwargs)
-            # ensure not to pollute the config object with torch_dtype="auto" - since it's
-            # meaningless in the context of the config object - torch.dtype values are acceptable
-            if kwargs.get("torch_dtype", None) == "auto":
-                _ = kwargs.pop("torch_dtype")
-            # to not overwrite the quantization_config if config has a quantization_config
-
-            if kwargs.get("quantization_config", None) is not None:
-                _ = kwargs.pop("quantization_config")
-
-            config, kwargs = AutoConfig.from_pretrained(
-                pretrained_model_name_or_path,
-                return_unused_kwargs=True,
-                trust_remote_code=trust_remote_code,
-                **hub_kwargs,
-                **kwargs,
-            )
+        quantization_config = kwargs.pop("quantization_config", None)
+        if isinstance(quantization_config, MixedPrecisionConfig):
+            kwargs["torch_dtype"] = torch.bfloat16
+        if load_in_8bit or load_in_4bit or quantization_config is not None:
+            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str
+            torch_dtype = kwargs.pop("torch_dtype", torch.float32)
+
+            if load_in_4bit:
+                if quantization_config is None:
+                    quantization_config = WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="nf4")
+                else:
+                    assert "4" in quantization_config.weight_dtype and quantization_config.compute_dtype == torch_dtype, \
+                        f"Quantization_config.weight_dtype should be 'nf4', 'int4_fullrange', 'int4_clip',"
+                        f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}."
+            elif load_in_8bit:
+                if quantization_config is None:
+                    quantization_config = WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="int8")
+                else:
+                    assert quantization_config.weight_dtype == "int8" \
+                        and quantization_config.compute_dtype == torch_dtype, \
+                        f"Quantization_config.weight_dtype should be 'int8' and compute_dtype should be {torch_dtype}."
-            # if torch_dtype=auto was passed here, ensure to pass it on
-            if kwargs_orig.get("torch_dtype", None) == "auto":
-                kwargs["torch_dtype"] = "auto"
-            quantization_config = kwargs_orig.get("quantization_config", None)
-            if quantization_config is not None and not (isinstance(quantization_config, SmoothQuantConfig) or
-                                                        isinstance(quantization_config, MixedPrecisionConfig) or
-                                                        isinstance(quantization_config, WeightOnlyQuantConfig)
-                                                        ):
-                kwargs["quantization_config"] = kwargs_orig["quantization_config"]
-            if isinstance(quantization_config, MixedPrecisionConfig):
-                config.torch_dtype=torch.bfloat16
-
-        has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map
-        has_local_code = type(config) in cls._model_mapping.keys()
-        trust_remote_code = resolve_trust_remote_code(
-            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
-        )
-        if has_remote_code and trust_remote_code:
-            class_ref = config.auto_map[cls.__name__]
-            model_class = get_class_from_dynamic_module(
-                class_ref, pretrained_model_name_or_path, **hub_kwargs, **kwargs
-            )
-            _ = hub_kwargs.pop("code_revision", None)
-            model = model_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
-            )
-        elif type(config) in cls._model_mapping.keys():
-            model_class = _get_model_class(config, cls._model_mapping)
-            model = model_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
-            )
-        else:
-            raise ValueError(
-                f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
-                f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
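For readers of the branch above, a hedged sketch of how the `load_in_4bit`/`load_in_8bit` shorthands relate to an explicit `WeightOnlyQuantConfig`, per the defaults chosen in this hunk. Only the class, flags, and keyword names are taken from the patch; the model id is illustrative.

```python
# Sketch only; mirrors the defaulting logic in the hunk above.
import torch
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

model_name = "facebook/opt-125m"  # illustrative small model

# Shorthand: with no quantization_config, load_in_4bit=True defaults to NF4 weights
# and the popped torch_dtype (torch.float32 by default) as compute dtype.
m4 = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)

# Equivalent explicit form per the branch above.
m4_explicit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=WeightOnlyQuantConfig(compute_dtype=torch.float32, weight_dtype="nf4"),
)

# 8-bit counterpart: weight_dtype="int8" with the same compute dtype.
m8 = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
```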
-            )
+        model = cls.ORIG_MODEL.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         model.eval()
         if isinstance(quantization_config, WeightOnlyQuantConfig):
             logger.info("Applying Weight Only Quantization.")
@@ -238,15 +186,17 @@ def default_calib_func(model):
                 model,
                 conf,
                 calib_func=calib_func
-            ).model
+            )
         return model
 
+class AutoModelForCausalLM(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModelForCausalLM
+
+
+class AutoModel(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModel
 
-class AutoModelForCausalLM(_BaseAutoModelClass):
-    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING
 
-class AutoModel(_BaseAutoModelClass):
-    _model_mapping = MODEL_MAPPING
 
+class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass):
+    ORIG_MODEL = transformers.AutoModelForSeq2SeqLM
 
-class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
-    _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING

From cc645cf71192c8e32cfa2117c979d30eacaa0300 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 19:04:55 +0800
Subject: [PATCH 15/19] Update README.md

---
 .../pytorch/text-generation/quantization/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index 204f31b69cc..f4ca1ec3be1 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -6,7 +6,7 @@ We provide the inference benchmarking script `run_generation.py` for [EleutherAI
 
 # Prerequisite
 ## 1. Create Environment
-Pytorch & Intel-extension-for-pytorch version 2.1 is required, the dependent packages are listed in requirements, we recommend create environment as the following steps.
+Pytorch and Intel-extension-for-pytorch version 2.1 are required; the dependent packages are listed in requirements, and we recommend creating the environment with the following steps.
 ```bash
 conda create -n llm python=3.9 -
@@ -21,7 +21,7 @@ python setup.py install
 > Disable semi-compiler to avoid accuracy regression for mpt and neural-chat-v1-1 models, other
 > models don't need it.
 > `export _DNNL_DISABLE_COMPILER_BACKEND=1`
-> Note: if `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` error raised when import intel-extension-for-pytorch, it is due to the high gcc library request, there is the solution to find the correct version.
+> Note: If `ImportError: /lib64/libstdc++.so.6: version ``GLIBCXX_3.4.29`` not found` is raised when importing intel-extension-for-pytorch, it is due to a newer gcc runtime requirement; use the following commands to find the correct library version.
 > ```bash
 > find $CONDA_PREFIX | grep libstdc++.so.6
 > export LD_PRELOAD=:${LD_PRELOAD}

From 66ae9a887c3ed449e223d1ef2041535725c79e6e Mon Sep 17 00:00:00 2001
From: changwangss
Date: Mon, 18 Sep 2023 04:15:11 -0700
Subject: [PATCH 16/19] fix long line

Signed-off-by: changwangss
---
 .../transformers/modeling/modeling_auto.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 8000c6dcaf0..ce5e668b281 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -66,7 +66,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         if isinstance(quantization_config, MixedPrecisionConfig):
             kwargs["torch_dtype"] = torch.bfloat16
         if load_in_8bit or load_in_4bit or quantization_config is not None:
-            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model, convert_dtype_2_str
+            from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
             torch_dtype = kwargs.pop("torch_dtype", torch.float32)
 
             if load_in_4bit:

From 509e132ba207605456170071b6f2a9591071ceba Mon Sep 17 00:00:00 2001
From: changwangss
Date: Mon, 18 Sep 2023 05:32:53 -0700
Subject: [PATCH 17/19] fix import

Signed-off-by: changwangss
---
 .../huggingface/pytorch/text-generation/quantization/README.md | 2 +-
 tests/requirements.txt                                         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index f4ca1ec3be1..67900340ef9 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -9,7 +9,7 @@
 
 Pytorch and Intel-extension-for-pytorch version 2.1 are required; the dependent packages are listed in requirements, and we recommend creating the environment with the following steps.
 ```bash
-conda create -n llm python=3.9 -
+conda create -n llm python=3.9 -y
 conda activate llm
 bash build_env.sh
 git clone https://github.com/intel/intel-extension-for-transformers.git
diff --git a/tests/requirements.txt b/tests/requirements.txt
index f988687f602..63ae3963112 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -18,3 +18,4 @@ evaluate
 wget
 optimum
 optimum-intel
+peft

From d48f055e129588f662c9f7824abadd70f30773c1 Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 20:47:12 +0800
Subject: [PATCH 18/19] Update README.md

---
 .../pytorch/text-generation/quantization/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/README.md b/examples/huggingface/pytorch/text-generation/quantization/README.md
index 67900340ef9..8bee0702d41 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/README.md
+++ b/examples/huggingface/pytorch/text-generation/quantization/README.md
@@ -49,7 +49,7 @@ OMP_NUM_THREADS= numactl -m -C python ru
     --mixed_precision \
     --benchmark
 # smoothquant
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 OMP_NUM_THREADS= numactl -m -C python run_generation.py \
     --model EleutherAI/gpt-j-6b \
     --sq \
@@ -83,7 +83,7 @@ python run_generation.py \
     --accuracy \
     --tasks "lambada_openai"
 # smoothquant
-# --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
+# [alternative] --int8 is used for int8 only, --int8_bf16_mixed is used for int8 mixed bfloat16 precision.
 python run_generation.py \
     --model EleutherAI/gpt-j-6b \
     --sq \

From 3b4080b1ddc97f0a108d5da713920d9524b79e6e Mon Sep 17 00:00:00 2001
From: "Wang, Chang"
Date: Mon, 18 Sep 2023 22:12:18 +0800
Subject: [PATCH 19/19] Update test_quantization.py

---
 tests/test_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_quantization.py b/tests/test_quantization.py
index c9d027ea8d2..a1c0988560d 100644
--- a/tests/test_quantization.py
+++ b/tests/test_quantization.py
@@ -308,7 +308,7 @@ def test_quantization_for_llm(self):
         q_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                        quantization_config=sq_config
                                                        )
-        self.assertTrue(isinstance(q_model, torch.jit.ScriptModule))
+        self.assertTrue(isinstance(q_model.model, torch.jit.ScriptModule))
         # weight-only
         woq_config = WeightOnlyQuantConfig()
         woq_model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
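Closing the series with a hedged sketch of how the saved SmoothQuant artifact is reloaded for the `--int8`/`--int8_bf16_mixed` paths shown earlier: the save calls, the `best_model.pt` file name, and the `TSModelForCausalLM` wrapper all appear in the hunks above, while the output directory is simply the script's default.

```python
# Sketch of the reload path used by run_generation.py after SmoothQuant, per this patch.
import intel_extension_for_pytorch as ipex  # imported before loading, as the script does above
from optimum.intel.generation.modeling import TSModelForCausalLM

output_dir = "./saved_results"  # default --output_dir; config + TorchScript model saved here

# The quantized TorchScript model has no generate() of its own, so the wrapper provides it.
user_model = TSModelForCausalLM.from_pretrained(
    output_dir, file_name="best_model.pt", trust_remote_code=False
)
```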