Skip to content

Commit

Permalink
new grpc server and client for tuning inference framework, dtype, inp…
Browse files Browse the repository at this point in the history
…ut length and batch size on LLMs from huggingface

Signed-off-by: ZHANGWENTAI <2092913428@qq.com>
  • Loading branch information
ZHANGWENTAI committed Oct 31, 2024
1 parent 9fb1a10 commit 26ecbf8
Show file tree
Hide file tree
Showing 30 changed files with 1,526 additions and 8 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,9 @@ vendor

# arch
arch

# model file cache
.kubedl_model_cache/

# vllm wheel file
*.whl
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,13 @@ We use grid search for configuration sampling.
kubectl -n morphling-system apply -f https://raw.githubusercontent.com/alibaba/morphling/main/examples/experiment/experiment-mobilenet-grid.yaml
```

To start a multi-framework tuning experiment:
```bash
kubectl -n morphling-system apply -f examples/experiment/experiment-grid.yaml
```

You can specify the model name in the file `examples/experiment/experiment-grid.yaml`. Note that under the setting of `INFERENCE_FRAMEWORK=vllm` and `DTYPE=int8`, bitsandbytes only supports LLMs with the LLaMA architecture (LlamaForCausalLM). So far we only support tuning between the float16/bfloat16 and int8 data types. Make sure there are enough resources for LLM serving.

#### Monitor the status of the configuration tuning experiment
```bash
kubectl get -n morphling-system pe
Expand Down Expand Up @@ -216,6 +223,12 @@ make test
```bash
make manifests
```
#### Build Multi inference framework Docker Image
Download the correct version of the vLLM `.whl` file to the `pkg/server` directory ([download guidance](https://docs.vllm.ai/en/latest/getting_started/installation.html#install-released-versions)) before building the image.
For example, if the CUDA version is 11.8 and you want vLLM version 0.6.1.post1, download `vllm-0.6.1.post1+cu118-cp310-cp310-manylinux1_x86_64.whl` to the `pkg/server` directory. Note that the Python version in this image is 3.10.
Then modify the `CUDA_VERSION` and `VLLM_FILE` arguments in `script/docker_build.sh`, and build the image.
#### Build the component docker images, e.g., Morphling controller, DB-Manager
```bash
Expand Down
23 changes: 23 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/predict.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Wire contract for the model-inference (prediction) gRPC service.
syntax = "proto3";

package api.predict;

// Generated Go code is emitted into ../grpc_predict/go (see proto.sh).
option go_package = "../grpc_predict/go";

// Define prediction service
service Predictor {
  // Perform model inference
  rpc Predict(PredictRequest) returns (PredictResponse);
}

// Prediction request
message PredictRequest {
  bytes input_data = 1; // Input data, can be serialized tensor or other formats
  map<string, string> metadata = 2; // Additional metadata
}

// Prediction response
message PredictResponse {
  bytes output_data = 1; // Output data, can be serialized tensor or other formats
  map<string, string> metadata = 2; // Additional metadata
}
13 changes: 13 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/proto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Generate Go and Python gRPC bindings from predict.proto.
#
# Fail fast on any error or unset variable: without this, the script would
# print the success message below even when protoc (or a plugin) fails or
# is not installed.
set -euo pipefail

# Set the proto file name (exported so child processes could read it too)
export PROTO_FILE=predict.proto

# Generate Go code
protoc --go_out=. "$PROTO_FILE"

# Generate Python code into the ./python3 directory
python3 -m grpc_tools.protoc -I. --python_out=python3 --grpc_python_out=python3 "$PROTO_FILE"

# Output completion information (only reached if all steps above succeeded)
echo "gRPC code generation completed for $PROTO_FILE"
49 changes: 49 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/python3/predict_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

101 changes: 101 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_predict/python3/predict_pb2_grpc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings

import predict_pb2 as predict__pb2

# Version of grpcio-tools that emitted this file; compared against the
# installed grpc runtime below to catch incompatible environments early.
GRPC_GENERATED_VERSION = '1.66.2'
GRPC_VERSION = grpc.__version__
_version_not_supported = False

try:
    from grpc._utilities import first_version_is_lower
    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
    # Runtimes too old to ship the comparison helper are, by that fact,
    # older than the generated code requires.
    _version_not_supported = True

if _version_not_supported:
    raise RuntimeError(
        f'The grpc package installed is at version {GRPC_VERSION},'
        + f' but the generated code in predict_pb2_grpc.py depends on'
        + f' grpcio>={GRPC_GENERATED_VERSION}.'
        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
    )


class PredictorStub(object):
    """Define prediction service

    Client-side stub: each attribute is a callable bound to one RPC method
    of the api.predict.Predictor service.
    """

    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Predict = channel.unary_unary(
                '/api.predict.Predictor/Predict',
                request_serializer=predict__pb2.PredictRequest.SerializeToString,
                response_deserializer=predict__pb2.PredictResponse.FromString,
                _registered_method=True)


class PredictorServicer(object):
    """Define prediction service

    Server-side base class: subclass and override Predict with the real
    inference implementation, then register via
    add_PredictorServicer_to_server.
    """

    def Predict(self, request, context):
        """Perform model inference

        Default implementation: reports UNIMPLEMENTED to the client and
        raises, so un-overridden methods fail loudly.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_PredictorServicer_to_server(servicer, server):
    """Register a PredictorServicer implementation with a grpc.Server."""
    # Map each RPC method name to a handler wrapping the servicer method
    # with the matching (de)serializers.
    rpc_method_handlers = {
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=predict__pb2.PredictRequest.FromString,
                    response_serializer=predict__pb2.PredictResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'api.predict.Predictor', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    server.add_registered_method_handlers('api.predict.Predictor', rpc_method_handlers)


# This class is part of an EXPERIMENTAL API.
class Predictor(object):
    """Define prediction service

    Convenience wrapper around grpc.experimental: lets callers invoke the
    Predict RPC without constructing a channel/stub themselves.
    """

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke Predict on *target* and return the PredictResponse."""
        return grpc.experimental.unary_unary(
            request,
            target,
            '/api.predict.Predictor/Predict',
            predict__pb2.PredictRequest.SerializeToString,
            predict__pb2.PredictResponse.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
36 changes: 36 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/api.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Storage service (v2): persists and retrieves per-trial tuning results.
syntax = "proto3";
package api.storage;
// Generated Go code is emitted into ../grpc_storage/go (see proto.sh).
option go_package = "../grpc_storage/go";

// DB exposes save/load of trial results keyed by (namespace, trial_name).
service DB {
  rpc SaveResult(SaveResultRequest) returns (SaveResultReply);
  rpc GetResult(GetResultRequest) returns (GetResultReply);
}

// Generic string key/value pair used for result entries.
message KeyValue {
  string key = 1;
  string value = 2;
}

// Empty acknowledgement for SaveResult.
message SaveResultReply {
}

message SaveResultRequest {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
  repeated KeyValue results = 4;
}

message GetResultRequest {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
}

// Echoes the request keys alongside the stored results.
message GetResultReply {
  string namespace = 1;
  string trial_name = 2;
  // string experiment_name = 3;
  repeated KeyValue results = 4;
}
4 changes: 4 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/proto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Generate Go and Python gRPC bindings from api.proto (storage v2).
#
# Fail fast on any error or unset variable; the original continued (and
# appeared to succeed) even when protoc failed or was missing.
set -euo pipefail

export PROTO_FILE=api.proto

# NOTE(review): `plugins=grpc` is a legacy github.com/golang/protobuf
# option; protoc-gen-go v1.20+ rejects it and requires a separate
# --go-grpc_out pass. Confirm which plugin version the build environment
# pins before changing this invocation.
protoc --go_out=plugins=grpc:./ "$PROTO_FILE"
python3 -m grpc_tools.protoc -I. --python_out=python3 --grpc_python_out=python3 "$PROTO_FILE"

# Completion message, consistent with grpc_predict/proto.sh; only reached
# if all generation steps succeeded.
echo "gRPC code generation completed for $PROTO_FILE"
47 changes: 47 additions & 0 deletions api/v1alpha1/grpc_proto/grpc_storage_v2/python3/api_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 26ecbf8

Please sign in to comment.