From 30eb13da1bbb39403369488738740eb66dbf4d2f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 12 Aug 2024 15:12:46 -0700 Subject: [PATCH] add kserve gpu tests (#3283) * add kserve gpu tests * add a check to validate gpu mem usage * fix typos * fix typo --- .github/workflows/kserve_cpu_tests.yml | 2 +- .github/workflows/kserve_gpu_tests.yml | 45 +++++++++++++++++ .../kserve/tests/configs/mnist_v1_gpu.yaml | 21 ++++++++ .../kserve/tests/configs/mnist_v2_gpu.yaml | 22 +++++++++ kubernetes/kserve/tests/scripts/test_mnist.sh | 48 ++++++++++++++++++- 5 files changed, 135 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/kserve_gpu_tests.yml create mode 100644 kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml create mode 100644 kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml diff --git a/.github/workflows/kserve_cpu_tests.yml b/.github/workflows/kserve_cpu_tests.yml index cd9ba84b45..0f56b392a6 100644 --- a/.github/workflows/kserve_cpu_tests.yml +++ b/.github/workflows/kserve_cpu_tests.yml @@ -42,4 +42,4 @@ jobs: ref: v0.12.1 path: kserve - name: Validate torchserve-kfs and Open Inference Protocol - run: ./kubernetes/kserve/tests/scripts/test_mnist.sh + run: ./kubernetes/kserve/tests/scripts/test_mnist.sh cpu diff --git a/.github/workflows/kserve_gpu_tests.yml b/.github/workflows/kserve_gpu_tests.yml new file mode 100644 index 0000000000..d11bd938ed --- /dev/null +++ b/.github/workflows/kserve_gpu_tests.yml @@ -0,0 +1,45 @@ +name: KServe GPU Nightly Tests + +on: + workflow_dispatch: + # runs everyday at 5:15am + schedule: + - cron: '15 5 * * *' + +jobs: + kserve-gpu-tests: + runs-on: [self-hosted, regression-test-gpu] + steps: + - name: Clean up previous run + run: | + echo "Cleaning up previous run" + ls -la ./ + sudo rm -rf ./* || true + sudo rm -rf ./.??* || true + ls -la ./ + - name: Install minikube and kubectl + run: | + curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 + sudo install minikube-linux-amd64 /usr/local/bin/minikube + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + echo "/usr/local/bin" >> $GITHUB_PATH + - name: Setup Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: 3.9 + architecture: x64 + - name: Install grpcurl + run: | + sudo curl -sSL https://github.com/fullstorydev/grpcurl/releases/download/v1.8.0/grpcurl_1.8.0_linux_x86_64.tar.gz | sudo tar -xz -C /usr/local/bin grpcurl + sudo chmod +x /usr/local/bin/grpcurl + - name: Checkout TorchServe + uses: actions/checkout@v3 + - name: Checkout kserve repo + uses: actions/checkout@v4 + with: + repository: kserve/kserve + ref: v0.12.1 + path: kserve + - name: Validate torchserve-kfs and Open Inference Protocol + run: ./kubernetes/kserve/tests/scripts/test_mnist.sh gpu diff --git a/kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml b/kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml new file mode 100644 index 0000000000..364ccc2716 --- /dev/null +++ b/kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml @@ -0,0 +1,21 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: "torchserve" +spec: + predictor: + pytorch: + storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v1 + image: pytorch/torchserve-kfs-nightly:latest-gpu + resources: + limits: + cpu: "1" + memory: 1Gi + nvidia.com/gpu: 1 + requests: + cpu: "100m" + memory: 256Mi + nvidia.com/gpu: 1 + args: + - --disable-token-auth + - --enable-model-api diff --git a/kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml b/kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml new file mode 100644 index 0000000000..00703ef833 --- /dev/null +++ b/kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml @@ -0,0 +1,22 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: "torchserve-mnist-v2" +spec: + predictor: + pytorch: + protocolVersion: v2 + storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v2 + image: pytorch/torchserve-kfs-nightly:latest-gpu + resources: + limits: + cpu: "1" + memory: 1Gi + nvidia.com/gpu: 1 + requests: + cpu: "100m" + memory: 256Mi + nvidia.com/gpu: 1 + args: + - --disable-token-auth + - --enable-model-api diff --git a/kubernetes/kserve/tests/scripts/test_mnist.sh b/kubernetes/kserve/tests/scripts/test_mnist.sh index 5d2d7de0f0..5c3532e1e5 100755 --- a/kubernetes/kserve/tests/scripts/test_mnist.sh +++ b/kubernetes/kserve/tests/scripts/test_mnist.sh @@ -2,6 +2,36 @@ set -o errexit -o nounset -o pipefail +device=$1 + +if [ "$device" = "gpu" ]; then + TEST_GPU="true" +else + TEST_GPU="false" +fi + +function validate_gpu_memory_usage() { + echo "Validating GPU memory usage..." + memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits) + + # Check if any GPU memory usage is above zero + memory_above_zero=false + while IFS= read -r usage; do + if [ "$usage" -gt 0 ]; then + memory_above_zero=true + break + fi + done <<< "$memory_usage" + + if [ "$memory_above_zero" = true ]; then + echo "GPU memory usage is greater than 0, proceeding with the tests." + else + echo "✘ GPU memory usage is 0, indicating no GPU activity. Test failed." + delete_minikube_cluster + exit 1 + fi +} + function start_minikube_cluster() { echo "Removing any previous Kubernetes cluster" minikube delete @@ -172,17 +202,31 @@ start_minikube_cluster install_kserve echo "MNIST KServe V2 test begin" -deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml" "torchserve-mnist-v2-predictor" +if [ "$TEST_GPU" = "true" ]; then + deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_gpu.yaml" "torchserve-mnist-v2-predictor" + validate_gpu_memory_usage +else + deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml" "torchserve-mnist-v2-predictor" +fi URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/${MODEL_NAME}/infer" make_cluster_accessible "torchserve-mnist-v2" ${URL} "./kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json" '{"model_name":"mnist","model_version":"1.0","id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","parameters":null,"outputs":[{"name":"input-0","shape":[1],"datatype":"INT64","parameters":null,"data":[1]}]}' kubectl delete inferenceservice torchserve-mnist-v2 echo "MNIST KServe V1 test begin" -deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml" "torchserve-predictor" +if [ "$TEST_GPU" = "true" ]; then + deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_gpu.yaml" "torchserve-predictor" +else + deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml" "torchserve-predictor" +fi URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict" make_cluster_accessible "torchserve" ${URL} "./kubernetes/kserve/kf_request_json/v1/mnist.json" '{"predictions":[2]}' kubectl delete inferenceservice torchserve +if [ "$TEST_GPU" = "true" ]; then + delete_minikube_cluster + exit 0 +fi + # OIP HTTP method calls echo "MNIST Torchserve Open Inference Protocol HTTP" SERVICE_NAME="torchserve-mnist-v2-http"