Skip to content

Commit

Permalink
new grpc server and client for tuning inference framework, dtype, inp…
Browse files Browse the repository at this point in the history
…ut length and batch size on LLMs from huggingface

Signed-off-by: ZHANGWENTAI <2092913428@qq.com>
  • Loading branch information
ZHANGWENTAI committed Oct 31, 2024
1 parent 9fb1a10 commit 26ecbf8
Show file tree
Hide file tree
Showing 30 changed files with 1,526 additions and 8 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,9 @@ vendor

# arch
arch

# model file cache
.kubedl_model_cache/

# vllm wheel file
*.whl
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,13 @@ We use grid search for configuration sampling.
kubectl -n morphling-system apply -f https://raw.githubusercontent.com/alibaba/morphling/main/examples/experiment/experiment-mobilenet-grid.yaml
```

To start a multi-framework tuning experiment:
```bash
kubectl -n morphling-system apply -f examples/experiment/experiment-grid.yaml
```

You can specify the model name in the file `examples/experiment/experiment-grid.yaml`. Note that under the setting of `INFERENCE_FRAMEWORK=vllm` and `DTYPE=int8`, bitsandbytes only supports LLMs with the LLaMA architecture (LlamaForCausalLM). So far we only support tuning between the float16/bfloat16 and int8 data types. Make sure there are enough resources for LLM serving.

#### Monitor the status of the configuration tuning experiment
```bash
kubectl get -n morphling-system pe
Expand Down Expand Up @@ -216,6 +223,12 @@ make test
```bash
make manifests
```
#### Build Multi inference framework Docker Image
Download the correct version of the vLLM `.whl` file to the `pkg/server` directory ([download guidance](https://docs.vllm.ai/en/latest/getting_started/installation.html#install-released-versions)) before building the image.
For example, if the CUDA version is 11.8 and you want vLLM version 0.6.1.post1, download `vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl` to the `pkg/server` directory. Note that the Python version in this image is 3.10.
Then modify the `CUDA_VERSION` and `VLLM_FILE` arguments in `script/docker_build.sh`, and build the image.
#### Build the component docker images, e.g., Morphling controller, DB-Manager
```bash
Expand Down
23 changes: 23 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/predict.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Wire contract for the model-inference (prediction) gRPC service.
syntax = "proto3";

package api.predict;

// Generated Go code is emitted into ../grpc_predict/go (see proto.sh).
option go_package = "../grpc_predict/go";

// Define prediction service
service Predictor {
  // Perform model inference
  rpc Predict(PredictRequest) returns (PredictResponse);
}

// Prediction request
message PredictRequest {
  bytes input_data = 1; // Input data, can be serialized tensor or other formats
  map<string, string> metadata = 2; // Additional metadata
}

// Prediction response
message PredictResponse {
  bytes output_data = 1; // Output data, can be serialized tensor or other formats
  map<string, string> metadata = 2; // Additional metadata
}
13 changes: 13 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/proto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Generate Go and Python gRPC bindings from predict.proto.
#
# Fail fast on any error or unset variable: without this, the script would
# print the success message below even when protoc (or a plugin) fails or
# is not installed.
set -euo pipefail

# Set the proto file name (exported so child processes could read it too)
export PROTO_FILE=predict.proto

# Generate Go code
protoc --go_out=. "$PROTO_FILE"

# Generate Python code into the ./python3 directory
python3 -m grpc_tools.protoc -I. --python_out=python3 --grpc_python_out=python3 "$PROTO_FILE"

# Output completion information (only reached if all steps above succeeded)
echo "gRPC code generation completed for $PROTO_FILE"
49 changes: 49 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/python3/predict_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

101 changes: 101 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/python3/predict_pb2_grpc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings

import predict_pb2 as predict__pb2

# Version of grpcio-tools that emitted this file; compared against the
# installed grpc runtime below to catch incompatible environments early.
GRPC_GENERATED_VERSION = '1.66.2'
GRPC_VERSION = grpc.__version__
_version_not_supported = False

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    # Runtimes too old to ship the comparison helper are, by that fact,
    # older than the generated code requires.
    _version_not_supported = True

if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + f' but the generated code in predict_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )


class PredictorStub(object):
    """Define prediction service

    Client-side stub: each attribute is a callable bound to one RPC method
    of the api.predict.Predictor service.
    """

    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Predict = channel.unary_unary(
                '/api.predict.Predictor/Predict',
                request_serializer=predict__pb2.PredictRequest.SerializeToString,
                response_deserializer=predict__pb2.PredictResponse.FromString,
                _registered_method=True)


class PredictorServicer(object):
    """Define prediction service

    Server-side base class: subclass and override Predict with the real
    inference implementation, then register via
    add_PredictorServicer_to_server.
    """

    def Predict(self, request, context):
        """Perform model inference

        Default implementation: reports UNIMPLEMENTED to the client and
        raises, so un-overridden methods fail loudly.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_PredictorServicer_to_server(servicer, server):
    """Register a PredictorServicer implementation with a grpc.Server."""
    # Map each RPC method name to a handler wrapping the servicer method
    # with the matching (de)serializers.
    rpc_method_handlers = {
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=predict__pb2.PredictRequest.FromString,
                    response_serializer=predict__pb2.PredictResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'api.predict.Predictor', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('api.predict.Predictor', rpc_method_handlers)


# This class is part of an EXPERIMENTAL API.
class Predictor(object):
    """Define prediction service

    Convenience wrapper around grpc.experimental: lets callers invoke the
    Predict RPC without constructing a channel/stub themselves.
    """

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke Predict on *target* and return the PredictResponse."""
        return grpc.experimental.unary_unary(
            request,
            target,
            '/api.predict.Predictor/Predict',
            predict__pb2.PredictRequest.SerializeToString,
            predict__pb2.PredictResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
36 changes: 36 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/api.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Storage service (v2): persists and retrieves per-trial tuning results.
syntax = "proto3";
package api.storage;
// Generated Go code is emitted into ../grpc_storage/go (see proto.sh).
option go_package = "../grpc_storage/go";

// DB exposes save/load of trial results keyed by (namespace, trial_name).
service DB {
  rpc SaveResult(SaveResultRequest) returns (SaveResultReply);
  rpc GetResult(GetResultRequest) returns (GetResultReply);
}

// Generic string key/value pair used for result entries.
message KeyValue {
  string key = 1;
  string value = 2;
}

// Empty acknowledgement for SaveResult.
message SaveResultReply {
}

message SaveResultRequest {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
  repeated KeyValue results = 4;
}

message GetResultRequest {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
}

// Echoes the request keys alongside the stored results.
message GetResultReply {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
  repeated KeyValue results = 4;
}
4 changes: 4 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/proto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Generate Go and Python gRPC bindings from api.proto (storage v2).
#
# Fail fast on any error or unset variable; the original continued (and
# appeared to succeed) even when protoc failed or was missing.
set -euo pipefail

export PROTO_FILE=api.proto

# NOTE(review): `plugins=grpc` is a legacy github.com/golang/protobuf
# option; protoc-gen-go v1.20+ rejects it and requires a separate
# --go-grpc_out pass. Confirm which plugin version the build environment
# pins before changing this invocation.
protoc --go_out=plugins=grpc:./ "$PROTO_FILE"
python3 -m grpc_tools.protoc -I. --python_out=python3 --grpc_python_out=python3 "$PROTO_FILE"

# Completion message, consistent with grpc_predict/proto.sh; only reached
# if all generation steps succeeded.
echo "gRPC code generation completed for $PROTO_FILE"
47 changes: 47 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/python3/api_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 26ecbf8

Please sign in to comment.