From 0653fd4b1aec24d50a2b9a75b71f9dbea9d833d9 Mon Sep 17 00:00:00 2001 From: Sid Rajaram Date: Sat, 24 Jun 2023 17:20:48 -0700 Subject: [PATCH 1/5] fixed arg parser link --- docs/internals.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/internals.md b/docs/internals.md index a82003ce9db..b27c00fba79 100644 --- a/docs/internals.md +++ b/docs/internals.md @@ -38,7 +38,7 @@ And backend is the Python code (most Pytorch specific stuff) ### Backend (Python) -https://github.com/pytorch/serve/blob/master/ts/arg_parser.py#L64 +https://github.com/pytorch/serve/blob/master/ts/arg_parser.py * Arg parser controls config/not workflow and can also setup a model service worker with a custom socket From 0f581e018cb9149ca5a3034143145693e6533c75 Mon Sep 17 00:00:00 2001 From: Sid Rajaram Date: Sun, 25 Jun 2023 15:23:58 -0700 Subject: [PATCH 2/5] handler for instruction embedding models --- examples/instruction_embedding/README.md | 87 +++++++++++++++++++ .../instructor-embedding-handler.py | 26 ++++++ .../instruction_embedding/requirements.txt | 1 + 3 files changed, 114 insertions(+) create mode 100644 examples/instruction_embedding/README.md create mode 100644 examples/instruction_embedding/instructor-embedding-handler.py create mode 100644 examples/instruction_embedding/requirements.txt diff --git a/examples/instruction_embedding/README.md b/examples/instruction_embedding/README.md new file mode 100644 index 00000000000..5bcfc188774 --- /dev/null +++ b/examples/instruction_embedding/README.md @@ -0,0 +1,87 @@ +# A TorchServe handler for Instructor Embedding models + +A simple handler that you can use to serve [Instructor Embedding models](https://instructor-embedding.github.io/) with TorchServe, supporting both single inference and batch inference. + +## Setup: + +**1.** [Download an Instructor model (i.e. Instructor-XL)](https://huggingface.co/hkunlp/instructor-xl/tree/main?clone=true) from HuggingFace into your model store directory of choosing. 
Copy the `instructor-embedding-handler.py` into the same directory as your newly downloaded directory containing all the model-related files. + +**2.** Create the .MAR Model Archive using [`torch-model-archiver`](https://github.com/pytorch/serve/blob/master/model-archiver/README.md): + +```bash +torch-model-archiver --model-name --version 1.0 --handler PATH/TO/instructor-embedding-handler.py --extra-files --serialized-file /pytorch_model.bin --f +``` + +**3.** Use [TorchServe](https://pytorch.org/serve/server.html) to startup the server and deploy the Instruction Embedding model you downloaded. + +**Note:** Instructor Embedding models are around ~4 GB. By default, torchserve will autoscale workers (each with a loaded copy of the model). [At present](https://github.com/pytorch/serve/issues/2432), if you have memory concerns, you have to make use of the [Management API](https://pytorch.org/serve/management_api.html) to bring up the server and deploy your model. + + +## Performing Inference +To perform inference for an instruction and corresponding sentence, use the following format for the request body: +```json +{ + "inputs": [INSTRUCTION, SENTENCE] +} +``` + +To perform batch inference, use the following format for the request body: +```json +{ + "inputs": [ + [INSTRUCTION_1, SENTENCE_1], + [INSTRUCTION_2, SENTENCE_2], + ... + ] +} +``` + +## Example: Single Inference +Request Endpoint: /predictions/ + +Request Body: +```json +{ + "inputs": ["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"] +} +``` + +### Response: +```json +[ + 0.010738617740571499, + ... 
+ 0.10961631685495377 +] +``` + +## Example: Batch Inference +Request Endpoint: /predictions/ + +Request Body: +```json +{ + "inputs": [ + ["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"], + ["Represent the Medicine sentence for retrieving a duplicate sentence:", "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear."] + ] +} +``` + +### Response: +```json +[ + [ + 0.010738617740571499, + ... + 0.10961631685495377 + ], + [ + 0.014582153409719467, + ... + 0.08006688207387924 + ] +] +``` + +**Note:** The above request example was for batch inference on 2 distinct instruction/sentence pairs. \ No newline at end of file diff --git a/examples/instruction_embedding/instructor-embedding-handler.py b/examples/instruction_embedding/instructor-embedding-handler.py new file mode 100644 index 00000000000..c28529177e6 --- /dev/null +++ b/examples/instruction_embedding/instructor-embedding-handler.py @@ -0,0 +1,26 @@ +from InstructorEmbedding import INSTRUCTOR +from ts.torch_handler.base_handler import BaseHandler +import logging + +logger = logging.getLogger(__name__) + +class InstructorEmbeddingHandler(BaseHandler): + def __init__(self): + super().__init__() + self.initialized = False + self.model = None + + def initialize(self, context): + properties = context.system_properties + logger.info("Initializing Instructor Embedding model...") + model_dir = properties.get("model_dir") + self.model = INSTRUCTOR(model_dir) + self.initialized = True + + def handle(self, data, context): + inputs = data[0].get("body").get("inputs") + if type(inputs[0]) == str: + # single inference + inputs = [inputs] + pred_embeddings = self.model.encode(inputs) + return [pred_embeddings.tolist()] diff --git a/examples/instruction_embedding/requirements.txt 
b/examples/instruction_embedding/requirements.txt new file mode 100644 index 00000000000..3205c675e9e --- /dev/null +++ b/examples/instruction_embedding/requirements.txt @@ -0,0 +1 @@ +InstructorEmbedding \ No newline at end of file From bdf4ebc6991d83ce7b368036255a571470de19eb Mon Sep 17 00:00:00 2001 From: Sid Rajaram Date: Mon, 26 Jun 2023 14:38:49 -0700 Subject: [PATCH 3/5] fixed some formatting, pylint fixes --- examples/instruction_embedding/README.md | 18 +++++++++--------- ...dler.py => instructor_embedding_handler.py} | 15 +++++++++++++-- 2 files changed, 22 insertions(+), 11 deletions(-) rename examples/instruction_embedding/{instructor-embedding-handler.py => instructor_embedding_handler.py} (74%) diff --git a/examples/instruction_embedding/README.md b/examples/instruction_embedding/README.md index 5bcfc188774..74865683d59 100644 --- a/examples/instruction_embedding/README.md +++ b/examples/instruction_embedding/README.md @@ -12,21 +12,21 @@ A simple handler that you can use to serve [Instructor Embedding models](https:/ torch-model-archiver --model-name --version 1.0 --handler PATH/TO/instructor-embedding-handler.py --extra-files --serialized-file /pytorch_model.bin --f ``` -**3.** Use [TorchServe](https://pytorch.org/serve/server.html) to startup the server and deploy the Instruction Embedding model you downloaded. +**3.** Use [TorchServe](https://pytorch.org/serve/server.html) to startup the server and deploy the Instruction Embedding model you downloaded. **Note:** Instructor Embedding models are around ~4 GB. By default, torchserve will autoscale workers (each with a loaded copy of the model). [At present](https://github.com/pytorch/serve/issues/2432), if you have memory concerns, you have to make use of the [Management API](https://pytorch.org/serve/management_api.html) to bring up the server and deploy your model. 
## Performing Inference -To perform inference for an instruction and corresponding sentence, use the following format for the request body: -```json +To perform inference for an instruction and corresponding sentence, use the following format for the request body: +```text { "inputs": [INSTRUCTION, SENTENCE] } ``` To perform batch inference, use the following format for the request body: -```json +```text { "inputs": [ [INSTRUCTION_1, SENTENCE_1], @@ -36,7 +36,7 @@ To perform batch inference, use the following format for the request body: } ``` -## Example: Single Inference +## Example: Single Inference Request Endpoint: /predictions/ Request Body: @@ -47,7 +47,7 @@ Request Body: ``` ### Response: -```json +```yaml [ 0.010738617740571499, ... @@ -55,7 +55,7 @@ Request Body: ] ``` -## Example: Batch Inference +## Example: Batch Inference Request Endpoint: /predictions/ Request Body: @@ -69,7 +69,7 @@ Request Body: ``` ### Response: -```json +```yaml [ [ 0.010738617740571499, @@ -84,4 +84,4 @@ Request Body: ] ``` -**Note:** The above request example was for batch inference on 2 distinct instruction/sentence pairs. \ No newline at end of file +**Note:** The above request example was for batch inference on 2 distinct instruction/sentence pairs. 
diff --git a/examples/instruction_embedding/instructor-embedding-handler.py b/examples/instruction_embedding/instructor_embedding_handler.py similarity index 74% rename from examples/instruction_embedding/instructor-embedding-handler.py rename to examples/instruction_embedding/instructor_embedding_handler.py index c28529177e6..2b29fc19ebd 100644 --- a/examples/instruction_embedding/instructor-embedding-handler.py +++ b/examples/instruction_embedding/instructor_embedding_handler.py @@ -1,10 +1,21 @@ +""" +Handler class for Instruction Embedding models (https://instructor-embedding.github.io/) +""" +import logging + from InstructorEmbedding import INSTRUCTOR + from ts.torch_handler.base_handler import BaseHandler -import logging logger = logging.getLogger(__name__) + class InstructorEmbeddingHandler(BaseHandler): + """ + Handler class for Instruction Embedding models. + Refer to the README for how to use Instructor models and this handler. + """ + def __init__(self): super().__init__() self.initialized = False @@ -19,7 +30,7 @@ def initialize(self, context): def handle(self, data, context): inputs = data[0].get("body").get("inputs") - if type(inputs[0]) == str: + if isinstance(inputs[0], str): # single inference inputs = [inputs] pred_embeddings = self.model.encode(inputs) From 1e48eefc72c4dce9bb4cdd8ea0c40273a10dc1af Mon Sep 17 00:00:00 2001 From: Sid Rajaram Date: Mon, 26 Jun 2023 16:58:18 -0700 Subject: [PATCH 4/5] explain output and what to do with it --- examples/instruction_embedding/README.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/instruction_embedding/README.md b/examples/instruction_embedding/README.md index 74865683d59..0304e111615 100644 --- a/examples/instruction_embedding/README.md +++ b/examples/instruction_embedding/README.md @@ -2,7 +2,7 @@ A simple handler that you can use to serve [Instructor Embedding models](https://instructor-embedding.github.io/) with TorchServe, supporting both single 
inference and batch inference. -## Setup: +# Setup: **1.** [Download an Instructor model (i.e. Instructor-XL)](https://huggingface.co/hkunlp/instructor-xl/tree/main?clone=true) from HuggingFace into your model store directory of choosing. Copy the `instructor-embedding-handler.py` into the same directory as your newly downloaded directory containing all the model-related files. @@ -17,7 +17,7 @@ torch-model-archiver --model-name --version 1.0 -- **Note:** Instructor Embedding models are around ~4 GB. By default, torchserve will autoscale workers (each with a loaded copy of the model). [At present](https://github.com/pytorch/serve/issues/2432), if you have memory concerns, you have to make use of the [Management API](https://pytorch.org/serve/management_api.html) to bring up the server and deploy your model. -## Performing Inference +# Performing Inference To perform inference for an instruction and corresponding sentence, use the following format for the request body: ```text { @@ -84,4 +84,16 @@ Request Body: ] ``` -**Note:** The above request example was for batch inference on 2 distinct instruction/sentence pairs. +**Note:** The above request example was for batch inference on 2 distinct instruction/sentence pairs. The output of the batch inference request is two embedding vectors corresponding to the two input pairs (instruction, sentence): + +**The first input was:** +["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"] + +**The second input was:** +["Represent the Medicine sentence for retrieving a duplicate sentence:", "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear."] + +The response was a list of 2 embedding vectors (numpy arrays converted .tolist() to ensure they were JSON serializable) corresponding to each of those inputs. 
The output vectors are quite long, so ellipses were used to make it more readable. + +# Then What? + +**Despite being slightly different under the hood compared to more traditional embedding models (e.g. Sentence Transformers), instruction embeddings can be used just like any other embeddings.** They are still just vector representations of your input text. The only difference is that the embedding vectors are *more fine-tuned* to the downstream task described by the instruction. To that end, these output embedding vectors can be stored or looked up in a vector database for [use cases](https://www.pinecone.io/learn/vector-embeddings-for-developers/#what-can-i-do-with-vector-embeddings) like semantic search, question answering, or long-term memory for large language models. Check out the [Instructor Embedding project page](https://instructor-embedding.github.io/) for more information. From b4f3321f6a4b0d78af4729f6f9aa2cce6de536ad Mon Sep 17 00:00:00 2001 From: Sid Rajaram Date: Mon, 26 Jun 2023 23:28:36 -0700 Subject: [PATCH 5/5] spellcheck, formatting --- examples/instruction_embedding/README.md | 2 +- ts_scripts/spellcheck_conf/wordlist.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/instruction_embedding/README.md b/examples/instruction_embedding/README.md index 0304e111615..efb2041aa09 100644 --- a/examples/instruction_embedding/README.md +++ b/examples/instruction_embedding/README.md @@ -92,7 +92,7 @@ Request Body: **The second input was:** ["Represent the Medicine sentence for retrieving a duplicate sentence:", "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear."] -The response was a list of 2 embedding vectors (numpy arrays converted .tolist() to ensure they were JSON serializable) corresponding to each of those inputs. 
The output vectors are quite long, so ellipses were used to make it more readable. +The response was a list of 2 embedding vectors (numpy arrays converted with `.tolist()` to ensure they were JSON serializable) corresponding to each of those inputs. The output vectors are quite long, so ellipses were used to make them more readable. # Then What? diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index db5a3805432..76141f1e831 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1060,3 +1060,5 @@ AMI DLAMI XLA inferentia +ActionSLAM +statins