diff --git a/examples/large_models/vllm/mistral/Readme.md b/examples/large_models/vllm/mistral/Readme.md
new file mode 100644
index 0000000000..267d57e11d
--- /dev/null
+++ b/examples/large_models/vllm/mistral/Readme.md
@@ -0,0 +1,55 @@
+# Example showing inference with vLLM on the mistralai/Mistral-7B-v0.1 model
+
+This is an example showing how to integrate [vLLM](https://github.com/vllm-project/vllm) with TorchServe and run inference on the `mistralai/Mistral-7B-v0.1` model.
+vLLM achieves high throughput using PagedAttention. More details can be found [here](https://vllm.ai/).
+
+### Step 1: Login to HuggingFace
+
+Log in with a HuggingFace account
+```bash
+huggingface-cli login
+# or using an environment variable
+huggingface-cli login --token $HUGGINGFACE_TOKEN
+```
+
+Then download the model:
+```bash
+python ../../Huggingface_accelerate/Download_model.py --model_path model --model_name mistralai/Mistral-7B-v0.1
+```
+The model will be saved under the `model` directory, in a snapshot path such as `model/models--mistralai--Mistral-7B-v0.1/snapshots/...` (see `model-config.yaml`).
+
+### Step 2: Generate MAR file
+
+Add the downloaded path to `model_path:` in `model-config.yaml` and run the following:
+
+```bash
+torch-model-archiver --model-name mistral7b --version 1.0 --handler custom_handler.py --config-file model-config.yaml -r requirements.txt --archive-format tgz
+```
+
+### Step 3: Add the MAR file to the model store
+
+```bash
+mkdir model_store
+mv mistral7b.tar.gz model_store
+```
+
+### Step 4: Start TorchServe
+
+```bash
+torchserve --start --ncs --ts-config config.properties --model-store model_store --models mistral7b.tar.gz
+```
+
+### Step 5: Run inference
+
+```bash
+curl -v "http://localhost:8080/predictions/mistral7b" -T sample_text.txt
+```
+
+This results in the following output:
+```
+Mayonnaise is made of eggs, oil, vinegar, salt and pepper. Using an electric blender, combine all the ingredients and beat at high speed for 4 to 5 minutes.
+
+Try it with some mustard and paprika mixed in, and a bit of sweetener if you like. But use real mayonnaise or it isn’t the same. Marlou
+
+What in the world is mayonnaise?
+```
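+
+Below is a minimal Python client sketch that sends the same request programmatically. It assumes the `requests` package is installed and that TorchServe is running locally as started above; the prompt string is only an example.
+
+```python
+import requests
+
+# Same prediction endpoint used by the curl command above
+prompt = "what is the recipe of mayonnaise?"
+response = requests.post(
+    "http://localhost:8080/predictions/mistral7b",
+    data=prompt.encode("utf-8"),
+)
+response.raise_for_status()
+print(response.text)
+```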
+ """ + model_dir = ctx.system_properties.get("model_dir") + self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"]) + model_name = ctx.model_yaml_config["handler"]["model_name"] + model_path = ctx.model_yaml_config["handler"]["model_path"] + tp_size = ctx.model_yaml_config["torchrun"]["nproc-per-node"] + seed = int(ctx.model_yaml_config["handler"]["manual_seed"]) + torch.manual_seed(seed) + + self.model = LLM(model=model_path, tensor_parallel_size=tp_size) + + logger.info("Model %s loaded successfully", ctx.model_name) + self.initialized = True + + def preprocess(self, requests): + """ + Pre-processing of prompts being sent to TorchServe + Args: + requests (list): A list of dictionaries with a "data" or "body" field, each + containing the input text to be processed. + Returns: + tuple: A tuple with two tensors: the batch of input ids and the batch of + attention masks. + """ + input_texts = [data.get("data") or data.get("body") for data in requests] + input_texts = [ + input_text.decode("utf-8") + if isinstance(input_text, (bytes, bytearray)) + else input_text + for input_text in input_texts + ] + return input_texts + + def inference(self, input_batch): + """ + Generates the model response for the given prompt + Args: + input_batch : List of input text prompts as returned by the preprocess function. + Returns: + list: A list of strings with the predicted values for each input text in the batch. + """ + logger.info(f"Input text is {input_batch}") + sampling_params = SamplingParams(max_tokens=self.max_new_tokens) + outputs = self.model.generate(input_batch, sampling_params=sampling_params) + + logger.info("Generated text: %s", outputs) + return outputs + + def postprocess(self, inference_output): + """Post Process Function returns the text response from the vLLM output. + Args: + inference_output (list): It contains the response of vLLM + Returns: + (list): Returns a list of the Predictions and Explanations. + """ + + return [inf_output.outputs[0].text for inf_output in inference_output] diff --git a/examples/large_models/vllm/mistral/model-config.yaml b/examples/large_models/vllm/mistral/model-config.yaml new file mode 100644 index 0000000000..9ec8f6ec46 --- /dev/null +++ b/examples/large_models/vllm/mistral/model-config.yaml @@ -0,0 +1,18 @@ +# TorchServe frontend parameters +minWorkers: 1 +maxWorkers: 1 +batchSize: 2 +maxBatchDelay: 100 +responseTimeout: 1200 +deviceType: "gpu" +# example of user specified GPU deviceIds +deviceIds: [0,1,2,3] # seting CUDA_VISIBLE_DEVICES + +torchrun: + nproc-per-node: 4 + +handler: + model_name: "mistralai/Mistral-7B-v0.1" + model_path: "/home/ubuntu/serve/examples/large_models/vllm/mistral/model/models--mistralai--Mistral-7B-v0.1/snapshots/5e9c98b96d071dce59368012254c55b0ec6f8658" + max_new_tokens: 100 + manual_seed: 40 diff --git a/examples/large_models/vllm/mistral/requirements.txt b/examples/large_models/vllm/mistral/requirements.txt new file mode 100644 index 0000000000..76f11f1540 --- /dev/null +++ b/examples/large_models/vllm/mistral/requirements.txt @@ -0,0 +1 @@ +vllm \ No newline at end of file diff --git a/examples/large_models/vllm/mistral/sample_text.txt b/examples/large_models/vllm/mistral/sample_text.txt new file mode 100644 index 0000000000..edfe9f4c10 --- /dev/null +++ b/examples/large_models/vllm/mistral/sample_text.txt @@ -0,0 +1 @@ +what is the recipe of mayonnaise? 
\ No newline at end of file
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 72f0185079..eefdb4f5e6 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1137,6 +1137,9 @@ Naver
 FlashAttention
 GenAI
 prem
+vLLM
+mistralai
+PagedAttention
 CachingMetric
 DSL
 SDPA
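
For readers following the handler above: `inference()` returns vLLM `RequestOutput` objects and `postprocess()` extracts the first generated sequence from each. The snippet below is a minimal standalone sketch of that flow, assuming `vllm` is installed and a single GPU is available; the model identifier and prompt are placeholders.

```python
from vllm import LLM, SamplingParams

# Load the model with vLLM (placeholder model id; the handler uses a local snapshot path)
llm = LLM(model="mistralai/Mistral-7B-v0.1", tensor_parallel_size=1)

# Mirrors the max_new_tokens setting from model-config.yaml
sampling_params = SamplingParams(max_tokens=100)

outputs = llm.generate(["what is the recipe of mayonnaise?"], sampling_params=sampling_params)

# Each RequestOutput holds one or more generated sequences; the handler returns
# outputs[i].outputs[0].text for each request in the batch.
print(outputs[0].outputs[0].text)
```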