deepspeed mii fastgen example #2779

Merged
merged 21 commits into from
Dec 14, 2023
8 changes: 6 additions & 2 deletions docs/grpc_api.md
@@ -87,8 +87,12 @@ service InferenceAPIsService {
}
```
The backend handler calls `send_intermediate_predict_response` to send an intermediate result to the frontend and returns the last result in the existing style. For example:
```
from ts.protocol.otf_message_handler import send_intermediate_predict_response
```python
from ts.handler_utils.utils import send_intermediate_predict_response
# Note: TorchServe v1.0.0 will deprecate
# "from ts.protocol.otf_message_handler import send_intermediate_predict_response".
# Please replace it with "from ts.handler_utils.utils import send_intermediate_predict_response".

def handle(data, context):
if type(data) is list:
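For reference (not part of this diff), the truncated handler snippet above continues along the lines of the TorchServe streaming example below; a minimal sketch in which the intermediate and final response strings are illustrative:

```python
from ts.handler_utils.utils import send_intermediate_predict_response


def handle(data, context):
    if type(data) is list:
        for i in range(3):
            # Each call pushes one intermediate result to the frontend, which
            # relays it to the client via gRPC streaming or HTTP 1.1 chunked encoding.
            send_intermediate_predict_response(
                ["intermediate_response"],
                context.request_ids,
                "Intermediate Prediction success",
                200,
                context,
            )
        # The returned value is delivered as the final chunk, exactly as in a
        # non-streaming handler.
        return ["hello world "]
```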
8 changes: 6 additions & 2 deletions docs/inference_api.md
@@ -102,8 +102,12 @@ The result is JSON that tells you that the image is most likely a tabby cat. The
```
* Streaming response via HTTP 1.1 chunked encoding
TorchServe's inference API supports streaming responses, allowing a sequence of inference responses to be sent over HTTP 1.1 chunked encoding. This feature is only recommended for use cases where the inference latency of the full response is high and intermediate inference results are sent to the client. An example is LLMs for generative applications, where generating "n" tokens can have high latency; in this case the user can receive each generated token as soon as it is ready, until the full response completes. To achieve a streaming response, the backend handler calls `send_intermediate_predict_response` to send an intermediate result to the frontend and returns the last result in the existing style. For example,
```
from ts.protocol.otf_message_handler import send_intermediate_predict_response
```python
from ts.handler_utils.utils import send_intermediate_predict_response
# Note: TorchServe v1.0.0 will deprecate
# "from ts.protocol.otf_message_handler import send_intermediate_predict_response".
# Please replace it with "from ts.handler_utils.utils import send_intermediate_predict_response".
def handle(data, context):
if type(data) is list:
for i in range (3):
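On the client side, the chunked response can be consumed incrementally rather than waiting for the full body; a minimal sketch (not part of this diff) using `requests`, where the model name `my_streaming_model` and the prompt are placeholders:

```python
import requests

# Placeholder model name; use the name the streaming model was registered under.
url = "http://localhost:8080/predictions/my_streaming_model"

with requests.post(url, data="example prompt", stream=True) as response:
    response.raise_for_status()
    # TorchServe sends each intermediate result as an HTTP 1.1 chunk; iterate as
    # chunks arrive instead of waiting for the complete response.
    for chunk in response.iter_content(chunk_size=None):
        if chunk:
            print(chunk.decode("utf-8"), end="", flush=True)
```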
13 changes: 11 additions & 2 deletions docs/large_model_inference.md
@@ -238,7 +238,11 @@ In this example, a model has 2 workers with job queue size 2. An inference reque
TorchServe's inference API supports streaming responses, allowing a sequence of inference responses to be sent over HTTP 1.1 chunked encoding. This feature is only recommended for use cases where the inference latency of the full response is high and intermediate inference results are sent to the client. An example is LLMs for generative applications, where generating "n" tokens can have high latency. In this case, the user can receive each generated token as soon as it is ready, until the full response completes. To achieve a streaming response, the backend handler calls `send_intermediate_predict_response` to send an intermediate result to the frontend and returns the last result in the existing style. For example,

```python
from ts.protocol.otf_message_handler import send_intermediate_predict_response
from ts.handler_utils.utils import send_intermediate_predict_response
# Note: TorchServe v1.0.0 will deprecate
# "from ts.protocol.otf_message_handler import send_intermediate_predict_response".
# Please replace it with "from ts.handler_utils.utils import send_intermediate_predict_response".
def handle(data, context):
if type(data) is list:
for i in range (3):
@@ -284,7 +288,12 @@ service InferenceAPIsService {
```
The backend handler calls `send_intermediate_predict_response` to send an intermediate result to the frontend and returns the last result in the existing style. For example:
```python
from ts.protocol.otf_message_handler import send_intermediate_predict_response
from ts.handler_utils.utils import send_intermediate_predict_response
# Note: TorchServe v1.0.0 will deprecate
# "from ts.protocol.otf_message_handler import send_intermediate_predict_response".
# Please replace it with "from ts.handler_utils.utils import send_intermediate_predict_response".


def handle(data, context):
if type(data) is list:
87 changes: 87 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/DeepSpeed_mii_handler.py
@@ -0,0 +1,87 @@
import logging
import os
from abc import ABC

import mii

from ts.context import Context
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("DeepSpeed MII version %s", mii.__version__)


class DeepSpeedMIIHandler(BaseHandler, ABC):
"""
DeepSpeed-MII handler class for text generation with large language models.
"""

def __init__(self):
self.device = int(os.getenv("LOCAL_RANK", 0))
self.initialized = False

def initialize(self, ctx: Context):
"""In this initialize function, the Stable Diffusion model is loaded and
initialized here.
Args:
ctx (context): It is a JSON Object containing information
pertaining to the model artifacts parameters.
"""
model_dir = ctx.system_properties.get("model_dir")
model_name = ctx.model_yaml_config["handler"]["model_name"]
model_path = ctx.model_yaml_config["handler"]["model_path"]
self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])

model_config = {
"tensor_parallel": int(ctx.model_yaml_config["handler"]["tensor_parallel"]),
"max_length": int(ctx.model_yaml_config["handler"]["max_length"]),
}
self.pipe = mii.pipeline(
model_name_or_path=model_path,
model_config=model_config,
)
logger.info("Model %s loaded successfully", model_name)
self.initialized = True

def preprocess(self, requests):
"""Basic text preprocessing, of the user's prompt.
Args:
requests (str): The Input data in the form of text is passed on to the preprocess
function.
Returns:
list : The preprocess function returns a list of prompts.
"""
inputs = []
for _, data in enumerate(requests):
input_text = data.get("data")
if input_text is None:
input_text = data.get("body")
if isinstance(input_text, (bytes, bytearray)):
input_text = input_text.decode("utf-8")
logger.info("Received text: '%s'", input_text)
inputs.append(input_text)
return inputs

def inference(self, inputs):
"""Generates the image relevant to the received text.
Args:
input_batch (list): List of Text from the pre-process function is passed here
Returns:
list : It returns a list of the generate images for the input text
"""
inferences = self.pipe(
inputs, max_new_tokens=self.max_new_tokens
).generated_texts

logger.info("Generated text: %s", inferences)
return inferences

def postprocess(self, inference_output):
"""Post Process Function converts the generated image into Torchserve readable format.
Args:
inference_output (list): It contains the generated image of the input text.
Returns:
(list): Returns a list of the images.
"""

return inference_output
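To make the request flow through this handler concrete, here is a minimal sketch (not part of the diff). It assumes `deepspeed-mii` is installed and the example directory is the working directory; the prompts are made up, and `initialize()`/`inference()` are only indicated in comments because they require the GPUs and weights configured in `model-config.yaml`:

```python
from DeepSpeed_mii_handler import DeepSpeedMIIHandler

# A batch of two hypothetical requests, as the TorchServe frontend would deliver
# them: each request carries its prompt under the "data" or "body" key.
batch = [{"data": b"What is DeepSpeed-MII?"}, {"body": b"Explain tensor parallelism."}]

handler = DeepSpeedMIIHandler()
prompts = handler.preprocess(batch)  # decodes bytes -> ["What is DeepSpeed-MII?", ...]
print(prompts)

# Inside TorchServe the full flow is:
#   handler.initialize(ctx)              # loads the MII pipeline
#   texts = handler.inference(prompts)   # runs generation
#   return handler.postprocess(texts)    # one generated text per request
```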
5 changes: 5 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/Readme.md
@@ -0,0 +1,5 @@
# Running an LLM with Microsoft DeepSpeed-MII in TorchServe

This example demonstrates serving a Hugging Face LLM with Microsoft DeepSpeed-MII in TorchServe. DeepSpeed-MII delivers significant system optimizations for DL model inference, drastically reducing both latency and cost.

The notebook example can be found in `mii-deepspeed-fastgen.ipynb`.
162 changes: 162 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/mii-deepspeed-fastgen.ipynb
@@ -0,0 +1,162 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"## Running LLM model using Microsoft DeepSpeed-MII in Torchserve.\n",
"This notebook briefs on serving HF LLM model with Microsoft DeepSpeed-MII in Torchserve. With DeepSpeed-MII there has been significant progress in system optimizations for DL model inference, drastically reducing both latency and cost."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 1: Download model\n",
"Login into huggingface hub with token by running the below command"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"huggingface-cli login"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!python ../../utils/Download_model.py --model_name meta-llama/Llama-2-13b-hf"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 2: Generate model artifacts"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2045.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n"
]
}
],
"source": [
"!torch-model-archiver --model-name mii-llama--Llama-2-13b-hf --version 1.0 --handler DeepSpeed_mii_handler.py --config-file model-config.yaml -r requirements.txt --archive-format no-archive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!mv model mii-llama--Llama-2-13b-hf\n",
"!cd ../../../../ && mkdir model_store && mv mii-llama--Llama-2-13b-hf model_store"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 3: Start torchserve"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!torchserve --ncs --start --model-store model_store --models mii-llama--Llama-2-13b-hf --ts-config benchmarks/config.properties"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"### Step 4: Run inference\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"!curl \"http://localhost:8080/predictions/mii-Llama-2-13b-hf\" -T examples/large_models/deepspeed_mii/LLM/sample.txt"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
20 changes: 20 additions & 0 deletions examples/large_models/deepspeed_mii/LLM/model-config.yaml
@@ -0,0 +1,20 @@
# TorchServe frontend parameters
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
parallelType: "tp"
deviceType: "gpu"
# example of user specified GPU deviceIds
deviceIds: [0,1,2,3] # setting CUDA_VISIBLE_DEVICES

torchrun:
nproc-per-node: 4

# TorchServe Backend parameters
handler:
model_name: "meta-llama/Llama-2-13b-hf"
model_path: "model/models--meta-llama--Llama-2-13b-hf/snapshots/99afe33d7eaa87c7fc6ea2594a0e4e7e588ee0a4"
tensor_parallel: 4
max_length: 4096
max_new_tokens: 256
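For illustration (not part of this diff), the `handler` parameters above map onto the DeepSpeed-MII calls made in `DeepSpeed_mii_handler.py`; a standalone sketch using the same values, assuming `deepspeed-mii` is installed, the snapshot path exists locally, and the script is launched across the four GPUs (e.g. via `torchrun` or `deepspeed`):

```python
import mii

# Values mirrored from the handler section of model-config.yaml above.
model_path = (
    "model/models--meta-llama--Llama-2-13b-hf/snapshots/"
    "99afe33d7eaa87c7fc6ea2594a0e4e7e588ee0a4"
)
model_config = {"tensor_parallel": 4, "max_length": 4096}

# Same call the handler's initialize() makes.
pipe = mii.pipeline(model_name_or_path=model_path, model_config=model_config)

# Same call the handler's inference() makes; max_new_tokens mirrors the config.
outputs = pipe(["What is DeepSpeed-MII?"], max_new_tokens=256)
print(outputs.generated_texts)
```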
1 change: 1 addition & 0 deletions examples/large_models/deepspeed_mii/LLM/requirements.txt
@@ -0,0 +1 @@
deepspeed-mii
1 change: 1 addition & 0 deletions examples/large_models/deepspeed_mii/LLM/sample.txt
@@ -0,0 +1 @@
The museum format went through significant transformations in the 20th century. For a long time, museums collected the art of previous generations. The demonstration of contemporary art required new approaches and fresh ideas. Modernization attempts appeared most often in the design of the outer parts of buildings; museums received attractive exterior decoration, such as the glass pyramids of the Louvre. The museum was supposed to evoke a respectful attitude towards what was stored within its walls. That is why museums were arranged in palaces or in specially built buildings, the appearance of which was supposed to inspire respect. However, it gradually became clear that this approach did not attract modern visitors. It became apparent that contemporary art needed a contemporary place of expression.
2 changes: 1 addition & 1 deletion examples/large_models/gpt_fast/handler.py
@@ -18,7 +18,7 @@
from sentencepiece import SentencePieceProcessor

from ts.handler_utils.timer import timed
from ts.protocol.otf_message_handler import send_intermediate_predict_response
from ts.handler_utils.utils import send_intermediate_predict_response
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
@@ -227,6 +227,9 @@ public void run() {
long begin = System.currentTimeMillis();
for (int i = 0; i < repeats; i++) {
reply = replies.poll(responseTimeout, TimeUnit.SECONDS);
if (req.getCommand() != WorkerCommands.LOAD) {
break;
}
}

long duration = System.currentTimeMillis() - begin;