[INFER][LLM] Add the AutoPredictor for inference (PaddlePaddle#9445)
* add the AutoPredictor

* decouple model loading from predictor loading

* polish the AutoPredictor and AutoModel
zeroRains authored Dec 3, 2024
1 parent 7a221cc commit 2c1387f
Showing 3 changed files with 103 additions and 74 deletions.
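
This commit routes predictor construction through a single dispatch point: the module-level create_predictor now only builds the tokenizer and (when needed) the model, then hands off to the new AutoPredictor.create_predictor. The snippet below is a hypothetical driver sketch, not part of the diff; it assumes PredictorArgument and ModelArgument are populated the way the llm/predict scripts already do, and the model path is a placeholder.

# Hypothetical usage sketch (assumes llm/predict/predictor.py is importable as `predictor`).
from predictor import PredictorArgument, ModelArgument, create_predictor

predictor_args = PredictorArgument(
    model_name_or_path="your-model-dir",  # placeholder path
    mode="dynamic",                       # "dynamic" or "static"
    inference_model=True,
    block_attn=True,
)
model_args = ModelArgument()
# Tensor-parallel info is no longer passed in; it is resolved internally via llm_utils.init_dist_env().
predictor = create_predictor(predictor_args, model_args)
outputs = predictor.predict(["Hello, how are you?"])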
2 changes: 1 addition & 1 deletion llm/predict/export_model.py
@@ -58,7 +58,7 @@ def main():
tensor_parallel_rank = hcg.get_model_parallel_rank()

# set predictor type
predictor = create_predictor(predictor_args, model_args, tensor_parallel_degree, tensor_parallel_rank)
predictor = create_predictor(predictor_args, model_args)
predictor.model.eval()

predictor.model.to_static(
171 changes: 99 additions & 72 deletions llm/predict/predictor.py
@@ -41,7 +41,7 @@
ChatGLMv2Tokenizer,
Llama3Tokenizer,
LlamaTokenizer,
PretrainedModel,
PretrainedConfig,
PretrainedTokenizer,
)
from paddlenlp.trl import llm_utils
@@ -245,11 +245,9 @@ def predict(self, input_texts: str | list[str], return_tokens=False):


class DygraphPredictor(BasePredictor):
def __init__(
self, config: PredictorArgument, model: PretrainedModel = None, tokenizer: PretrainedTokenizer = None
):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
super().__init__(config, tokenizer)
self.model = model
self.model = kwargs.get("model", None)
if config.lora_path is not None:
lora_config = LoRAConfig.from_pretrained(config.lora_path)
dtype = lora_config.dtype
@@ -326,7 +324,7 @@ def stream_predict(self, inputs: dict[str, paddle.Tensor]):


class StaticGraphPredictor(BasePredictor):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
super().__init__(config, tokenizer)

inference_config = paddle.inference.Config(self.config.model_name_or_path, self.config.model_prefix)
@@ -623,14 +621,16 @@ def _preprocess(self, source):
return inputs


class StaticInferencePredictor(InferencePredictorMixin):
class StaticGraphInferencePredictor(InferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
cache_kvs_shape: list[list[int]],
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
self.cache_kvs_shape = cache_kvs_shape
self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
if self.cache_kvs_shape is None:
raise ValueError("cache_kvs_shape should be provided for StaticGraphInferencePredictor")
InferencePredictorMixin.__init__(self, config, tokenizer)

self.predictor = self._create_predictor(config)
@@ -701,9 +701,12 @@ class DygraphInferencePredictor(InferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
model: PretrainedModel = None,
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
model = kwargs.get("model", None)
if model is None:
raise ValueError("model should be provided for DygraphInferencePredictor")
self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size, config.total_max_length)
InferencePredictorMixin.__init__(self, config, tokenizer)
self.model = model
@@ -982,12 +985,10 @@ def _preprocess(self, input_text: list[str]):


class DygraphBlockInferencePredictor(BlockInferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
model: PretrainedModel = None,
tokenizer: PretrainedTokenizer = None,
):
def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = None, **kwargs):
model = kwargs.get("model", None)
if model is None:
raise ValueError("model should be provided for DygraphBlockInferencePredictor")
self.cache_kvs_shape = model.get_cache_kvs_shape(model.config, config.batch_size)
BlockInferencePredictorMixin.__init__(self, config, tokenizer)

@@ -1079,14 +1080,16 @@ def predict(self, input_texts: list[str], return_tokens=False):
return outputs


class StaticBlockInferencePredictor(BlockInferencePredictorMixin):
class StaticGraphBlockInferencePredictor(BlockInferencePredictorMixin):
def __init__(
self,
config: PredictorArgument,
cache_kvs_shape: list[list[int]],
tokenizer: PretrainedTokenizer = None,
**kwargs,
):
self.cache_kvs_shape = cache_kvs_shape
self.cache_kvs_shape = kwargs.get("cache_kvs_shape", None)
if self.cache_kvs_shape is None:
raise ValueError("cache_kvs_shape should be provided for StaticGraphBlockInferencePredictor")
BlockInferencePredictorMixin.__init__(self, config, tokenizer)

self._create_predictor(config)
@@ -1224,21 +1227,71 @@ def predict(self, input_texts: list[str], return_tokens=False):
return outputs


def get_ptq_multicards_num(directory):
count = 0
if os.path.exists(directory):
prefix = "act_scales_"
for filename in os.listdir(directory):
if filename.startswith(prefix):
count += 1
return count
class AutoPredictor:
def __init__(self, *args, **kwargs):
raise EnvironmentError(
f"{self.__class__.__name__} is designed to be instantiated "
f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`"
)

@classmethod
def create_predictor(
cls,
predictor_args: PredictorArgument,
config: PretrainedConfig,
model_args: ModelArgument,
tokenizer: PretrainedTokenizer = None,
**kwargs
):
"""
Create a predictor
Args:
predictor_args (PredictorArgument): The predictor arguments.
config (PretrainedConfig): The model configuration.
model_args (ModelArgument): The model arguments.
tokenizer (PretrainedTokenizer): The tokenizer.
**kwargs: Additional keyword arguments.
Returns:
Predictor: The predictor.
"""
model = kwargs.pop("model", None)
cache_kvs_shape = None

# static or dynamic
execute_mode = "Dygraph" if predictor_args.mode == "dynamic" else "StaticGraph"

# infer/ no infer
if predictor_args.inference_model:
# block/no block
if predictor_args.block_attn:
attn_type = "Block"
else:
attn_type = ""
inference_mode = f"{attn_type}Inference"

if predictor_args.mode == "static":
cache_kvs_shape = model.get_cache_kvs_shape(
config, predictor_args.batch_size, predictor_args.total_max_length
)
else:
inference_mode = ""

predictor_class_name = execute_mode + inference_mode + "Predictor"

import_class = sys.modules[__name__]

# import class
predictor_class = getattr(import_class, predictor_class_name)

# instance
predictor = predictor_class(predictor_args, tokenizer=tokenizer, model=model, cache_kvs_shape=cache_kvs_shape)
return predictor


def create_predictor(
predictor_args: PredictorArgument,
model_args: ModelArgument,
tensor_parallel_degree: int = 1,
tensor_parallel_rank: int = 0,
):
tokenizer = AutoTokenizer.from_pretrained(
predictor_args.model_name_or_path,
@@ -1272,9 +1325,23 @@ def create_predictor(
predictor_args.temperature = 1.0

tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
if not predictor_args.inference_model:
tokenizer.padding_side = "left"

model = None

# model loading
if predictor_args.inference_model:
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
else:
if predictor_args.mode == "dynamic":
# model import (gpt-3,ernie) or AutoModel
if model_args.model_type == "gpt-3":
sys.path.append("./gpt-3")
from modeling import GPTForCausalLM
@@ -1309,47 +1376,7 @@ def create_predictor(
tensor_parallel_output=False,
)

predictor = DygraphPredictor(predictor_args, model=model, tokenizer=tokenizer)
elif predictor_args.mode == "static":
predictor = StaticGraphPredictor(predictor_args, tokenizer=tokenizer)
else:
raise ValueError("the `mode` should be one of [dynamic, static]")
else:
if predictor_args.mode == "dynamic":
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
model.eval()
if predictor_args.block_attn:
predictor = DygraphBlockInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)
else:
predictor = DygraphInferencePredictor(predictor_args, model=model, tokenizer=tokenizer)

elif predictor_args.mode == "static":
model = AutoInferenceModelForCausalLM.from_pretrained(
predictor_args.model_name_or_path,
config=config,
predictor_args=predictor_args,
model_args=model_args,
dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
cache_kvs_shape = model.get_cache_kvs_shape(
config, predictor_args.batch_size, predictor_args.total_max_length
)
if predictor_args.block_attn:
predictor = StaticBlockInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
else:
predictor = StaticInferencePredictor(predictor_args, cache_kvs_shape, tokenizer=tokenizer)
else:
raise ValueError("the `mode` should be one of [dynamic, static]")
predictor = AutoPredictor.create_predictor(predictor_args, config, model_args, tokenizer, model=model)

return predictor

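The core of the new AutoPredictor.create_predictor is name-based dispatch: an execution-mode prefix ("Dygraph" or "StaticGraph") is combined with an inference/block-attention infix and resolved against this module with getattr. A minimal standalone sketch of that naming scheme (a hypothetical helper, simplified from the code above):

# Simplified sketch of the class-name dispatch used by AutoPredictor.create_predictor.
def _predictor_class_name(mode: str, inference_model: bool, block_attn: bool) -> str:
    execute_mode = "Dygraph" if mode == "dynamic" else "StaticGraph"
    inference_mode = (("Block" if block_attn else "") + "Inference") if inference_model else ""
    return execute_mode + inference_mode + "Predictor"

# ("dynamic", True,  True)  -> "DygraphBlockInferencePredictor"
# ("static",  True,  False) -> "StaticGraphInferencePredictor"
# ("dynamic", False, False) -> "DygraphPredictor"

This is also why the classes formerly named StaticInferencePredictor and StaticBlockInferencePredictor are renamed to StaticGraphInferencePredictor and StaticGraphBlockInferencePredictor in this commit: their names must match the "StaticGraph" prefix the dispatcher assembles.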
4 changes: 3 additions & 1 deletion paddlenlp/transformers/auto/modeling.py
@@ -858,7 +858,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
)

if predictor_args.mode == "dynamic":
return model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype)
model = model_class.from_pretrained(predictor_args.model_name_or_path, config=config, dtype=dtype)
model.eval()
return model

return model_class

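Related to the decoupling, AutoInferenceModelForCausalLM.from_pretrained now puts the dynamic-mode model into eval mode before returning it, so create_predictor no longer calls model.eval() itself (that call is dropped from predictor.py above). A hedged caller-side sketch of the effect, reusing the same arguments as in the diff:

# Caller-side sketch: loading an inference model after this commit.
model = AutoInferenceModelForCausalLM.from_pretrained(
    predictor_args.model_name_or_path,
    config=config,
    predictor_args=predictor_args,
    model_args=model_args,
    dtype=predictor_args.dtype,
    tensor_parallel_degree=tensor_parallel_degree,
    tensor_parallel_rank=tensor_parallel_rank,
)
# In dynamic mode the returned model is already in eval mode; no extra model.eval() is needed.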
