add kserve gpu tests (#3283)
* add kserve gpu tests

* add a check to validate gpu mem usage

* fix typos

* fix typo
rohithkrn authored Aug 12, 2024
1 parent b24c72d commit 30eb13d
Showing 5 changed files with 135 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/kserve_cpu_tests.yml
@@ -42,4 +42,4 @@ jobs:
           ref: v0.12.1
           path: kserve
       - name: Validate torchserve-kfs and Open Inference Protocol
-        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh
+        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh cpu
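With this change, test_mnist.sh takes the target device as its first positional argument. A minimal sketch of running the suite locally, assuming minikube and kubectl are installed and the TorchServe repo root is the working directory:

# Run the KServe MNIST suite against a local minikube cluster
./kubernetes/kserve/tests/scripts/test_mnist.sh cpu   # CPU-only path
./kubernetes/kserve/tests/scripts/test_mnist.sh gpu   # GPU configs plus the memory-usage check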
45 changes: 45 additions & 0 deletions .github/workflows/kserve_gpu_tests.yml
@@ -0,0 +1,45 @@
name: KServe GPU Nightly Tests

on:
  workflow_dispatch:
  # runs every day at 5:15 AM
  schedule:
    - cron: '15 5 * * *'

jobs:
  kserve-gpu-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
      - name: Setup Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          architecture: x64
      - name: Install grpcurl
        run: |
          sudo curl -sSL https://github.com/fullstorydev/grpcurl/releases/download/v1.8.0/grpcurl_1.8.0_linux_x86_64.tar.gz | sudo tar -xz -C /usr/local/bin grpcurl
          sudo chmod +x /usr/local/bin/grpcurl
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Checkout kserve repo
        uses: actions/checkout@v4
        with:
          repository: kserve/kserve
          ref: v0.12.1
          path: kserve
      - name: Validate torchserve-kfs and Open Inference Protocol
        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh gpu
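Besides the nightly cron trigger, workflow_dispatch allows on-demand runs. A sketch using the GitHub CLI, assuming gh is authenticated and pytorch/serve is the hosting repository:

# Kick off the GPU suite manually and follow the run
gh workflow run kserve_gpu_tests.yml --repo pytorch/serve
gh run watch --repo pytorch/serve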
21 changes: 21 additions & 0 deletions kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml
@@ -0,0 +1,21 @@
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: "torchserve"
spec:
  predictor:
    pytorch:
      storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v1
      image: pytorch/torchserve-kfs-nightly:latest-gpu
      resources:
        limits:
          cpu: "1"
          memory: 1Gi
          nvidia.com/gpu: 1
        requests:
          cpu: "100m"
          memory: 256Mi
          nvidia.com/gpu: 1
      args:
        - --disable-token-auth
        - --enable-model-api
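For a manual smoke test of this spec outside CI, something like the following should work, assuming a cluster with KServe and the NVIDIA device plugin already installed:

# Deploy the v1 GPU InferenceService and wait until it reports Ready
kubectl apply -f kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml
kubectl wait --for=condition=Ready inferenceservice/torchserve --timeout=600s
kubectl get inferenceservice torchserve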
22 changes: 22 additions & 0 deletions kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml
@@ -0,0 +1,22 @@
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: "torchserve-mnist-v2"
spec:
  predictor:
    pytorch:
      protocolVersion: v2
      storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v2
      image: pytorch/torchserve-kfs-nightly:latest-gpu
      resources:
        limits:
          cpu: "1"
          memory: 1Gi
          nvidia.com/gpu: 1
        requests:
          cpu: "100m"
          memory: 256Mi
          nvidia.com/gpu: 1
      args:
        - --disable-token-auth
        - --enable-model-api
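Once the v2 service reports Ready, an inference call follows the Open Inference Protocol (/v2/models/<name>/infer). A hedged sketch, reusing the ingress variables and request payload that test_mnist.sh sets up; the hostname resolution assumes the default KServe ingress layout:

# Resolve the service hostname and POST a v2 inference request
SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve-mnist-v2 -o jsonpath='{.status.url}' | cut -d/ -f3)
curl -s -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" \
  "http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/mnist/infer" \
  -d @kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json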
48 changes: 46 additions & 2 deletions kubernetes/kserve/tests/scripts/test_mnist.sh
@@ -2,6 +2,36 @@

 set -o errexit -o nounset -o pipefail
 
+device=$1
+
+if [ "$device" = "gpu" ]; then
+    TEST_GPU="true"
+else
+    TEST_GPU="false"
+fi
+
+function validate_gpu_memory_usage() {
+    echo "Validating GPU memory usage..."
+    memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)
+
+    # Check if any GPU memory usage is above zero
+    memory_above_zero=false
+    while IFS= read -r usage; do
+        if [ "$usage" -gt 0 ]; then
+            memory_above_zero=true
+            break
+        fi
+    done <<< "$memory_usage"
+
+    if [ "$memory_above_zero" = true ]; then
+        echo "GPU memory usage is greater than 0, proceeding with the tests."
+    else
+        echo "✘ GPU memory usage is 0, indicating no GPU activity. Test failed."
+        delete_minikube_cluster
+        exit 1
+    fi
+}
+
 function start_minikube_cluster() {
     echo "Removing any previous Kubernetes cluster"
     minikube delete
@@ -172,17 +202,31 @@ start_minikube_cluster
 install_kserve
 
 echo "MNIST KServe V2 test begin"
-deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml" "torchserve-mnist-v2-predictor"
+if [ "$TEST_GPU" = "true" ]; then
+    deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml" "torchserve-mnist-v2-predictor"
+    validate_gpu_memory_usage
+else
+    deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml" "torchserve-mnist-v2-predictor"
+fi
 URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/${MODEL_NAME}/infer"
 make_cluster_accessible "torchserve-mnist-v2" ${URL} "./kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json" '{"model_name":"mnist","model_version":"1.0","id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","parameters":null,"outputs":[{"name":"input-0","shape":[1],"datatype":"INT64","parameters":null,"data":[1]}]}'
 kubectl delete inferenceservice torchserve-mnist-v2
 
 echo "MNIST KServe V1 test begin"
-deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml" "torchserve-predictor"
+if [ "$TEST_GPU" = "true" ]; then
+    deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml" "torchserve-predictor"
+else
+    deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml" "torchserve-predictor"
+fi
 URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict"
 make_cluster_accessible "torchserve" ${URL} "./kubernetes/kserve/kf_request_json/v1/mnist.json" '{"predictions":[2]}'
 kubectl delete inferenceservice torchserve
 
+if [ "$TEST_GPU" = "true" ]; then
+    delete_minikube_cluster
+    exit 0
+fi
+
 # OIP HTTP method calls
 echo "MNIST Torchserve Open Inference Protocol HTTP"
 SERVICE_NAME="torchserve-mnist-v2-http"
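The GPU validation added above reduces to reading nvidia-smi's per-GPU used-memory column. A standalone sketch of the same check for a dev box, assuming only that nvidia-smi is on PATH:

#!/usr/bin/env bash
# Fail unless at least one GPU reports nonzero used memory
used_mib=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)
echo "Per-GPU used memory (MiB):"
echo "${used_mib}"
# Any digit 1-9 in the output means some GPU reports > 0 MiB used
if grep -q '[1-9]' <<< "${used_mib}"; then
    echo "GPU activity detected; tests may proceed."
else
    echo "No GPU activity detected." >&2
    exit 1
fi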
