From 106997d42e750014de94959c939442ac4e23378b Mon Sep 17 00:00:00 2001
From: Gregory Horvath
Date: Fri, 26 Jul 2024 14:37:38 -0400
Subject: [PATCH] feat(backend): nvidia runtimeclass (#787)

* adds nvidia runtimeClassName to text-embeddings, vllm, and whisper

* adds configuration via Zarf/UDS
---
 packages/text-embeddings/chart/templates/deployment.yaml | 5 +++++
 packages/text-embeddings/embedding-values.yaml           | 5 ++++-
 packages/text-embeddings/zarf.yaml                       | 4 ++++
 packages/vllm/chart/templates/deployment.yaml            | 1 +
 packages/vllm/vllm-values.yaml                           | 3 +++
 packages/whisper/Dockerfile                              | 2 +-
 packages/whisper/chart/templates/deployment.yaml         | 5 +++++
 packages/whisper/whisper-values.yaml                     | 5 ++++-
 packages/whisper/zarf.yaml                               | 4 ++++
 uds-bundles/dev/cpu/uds-config.yaml                      | 2 ++
 uds-bundles/dev/gpu/uds-config.yaml                      | 8 +++++---
 uds-bundles/latest/cpu/uds-config.yaml                   | 3 +++
 uds-bundles/latest/gpu/uds-config.yaml                   | 8 +++++---
 13 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/packages/text-embeddings/chart/templates/deployment.yaml b/packages/text-embeddings/chart/templates/deployment.yaml
index 8d014f1c7..d06ac87f3 100644
--- a/packages/text-embeddings/chart/templates/deployment.yaml
+++ b/packages/text-embeddings/chart/templates/deployment.yaml
@@ -25,6 +25,11 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      {{- if gt (index .Values.resources.limits "nvidia.com/gpu") 0.0 }}
+      runtimeClassName: nvidia
+      {{- else if .Values.gpu.runtimeClassName }}
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
+      {{- end }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
diff --git a/packages/text-embeddings/embedding-values.yaml b/packages/text-embeddings/embedding-values.yaml
index 04a1455ff..1f67daea8 100644
--- a/packages/text-embeddings/embedding-values.yaml
+++ b/packages/text-embeddings/embedding-values.yaml
@@ -1,6 +1,9 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
 
+gpu:
+  runtimeClassName: "###ZARF_VAR_GPU_CLASS_NAME###"
+
 resources:
   limits:
-    nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###"
+    nvidia.com/gpu: ###ZARF_VAR_GPU_LIMIT###
diff --git a/packages/text-embeddings/zarf.yaml b/packages/text-embeddings/zarf.yaml
index d3bec755c..251e62b87 100644
--- a/packages/text-embeddings/zarf.yaml
+++ b/packages/text-embeddings/zarf.yaml
@@ -16,6 +16,10 @@ variables:
     description: The GPU limit for the model inferencing.
     default: "0"
     pattern: "^[0-9]+$"
+  - name: GPU_CLASS_NAME
+    description: The GPU class name for the model inferencing. Leave blank for CPU-only.
+    default: ""
+    pattern: "^(nvidia)?$"
 
 components:
   - name: text-embeddings-model
diff --git a/packages/vllm/chart/templates/deployment.yaml b/packages/vllm/chart/templates/deployment.yaml
index 8d014f1c7..8aee1170b 100644
--- a/packages/vllm/chart/templates/deployment.yaml
+++ b/packages/vllm/chart/templates/deployment.yaml
@@ -25,6 +25,7 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
diff --git a/packages/vllm/vllm-values.yaml b/packages/vllm/vllm-values.yaml
index 3e3c22379..f79840b9f 100644
--- a/packages/vllm/vllm-values.yaml
+++ b/packages/vllm/vllm-values.yaml
@@ -1,2 +1,5 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
+
+gpu:
+  runtimeClassName: nvidia
diff --git a/packages/whisper/Dockerfile b/packages/whisper/Dockerfile
index 1686c8d3c..28b107163 100644
--- a/packages/whisper/Dockerfile
+++ b/packages/whisper/Dockerfile
@@ -26,7 +26,7 @@ RUN pip uninstall -y ctranslate2 transformers[torch]
 RUN pip install packages/whisper/build/lfai_whisper*.whl --no-index --find-links=packages/whisper/build/
 
 # Use hardened ffmpeg image to get compiled binaries
-FROM cgr.dev/chainguard/ffmpeg:latest as ffmpeg
+FROM cgr.dev/chainguard/ffmpeg:latest AS ffmpeg
 
 # hardened and slim python image
 FROM ghcr.io/defenseunicorns/leapfrogai/python:3.11
diff --git a/packages/whisper/chart/templates/deployment.yaml b/packages/whisper/chart/templates/deployment.yaml
index da60b2778..33eaf2b58 100644
--- a/packages/whisper/chart/templates/deployment.yaml
+++ b/packages/whisper/chart/templates/deployment.yaml
@@ -25,6 +25,11 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      {{- if gt (index .Values.resources.limits "nvidia.com/gpu") 0.0 }}
+      runtimeClassName: nvidia
+      {{- else if .Values.gpu.runtimeClassName }}
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
+      {{- end }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
diff --git a/packages/whisper/whisper-values.yaml b/packages/whisper/whisper-values.yaml
index 04a1455ff..1f67daea8 100644
--- a/packages/whisper/whisper-values.yaml
+++ b/packages/whisper/whisper-values.yaml
@@ -1,6 +1,9 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
 
+gpu:
+  runtimeClassName: "###ZARF_VAR_GPU_CLASS_NAME###"
+
 resources:
   limits:
-    nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###"
+    nvidia.com/gpu: ###ZARF_VAR_GPU_LIMIT###
diff --git a/packages/whisper/zarf.yaml b/packages/whisper/zarf.yaml
index 52a44efc6..f97f8ea76 100644
--- a/packages/whisper/zarf.yaml
+++ b/packages/whisper/zarf.yaml
@@ -16,6 +16,10 @@ variables:
     description: The GPU limit for the model inferencing.
     default: "0"
     pattern: "^[0-9]+$"
+  - name: GPU_CLASS_NAME
+    description: The GPU class name for the model inferencing. Leave blank for CPU-only.
+    default: ""
+    pattern: "^(nvidia)?$"
 
 components:
   - name: whisper-model
diff --git a/uds-bundles/dev/cpu/uds-config.yaml b/uds-bundles/dev/cpu/uds-config.yaml
index d610a1972..85e8e3e74 100644
--- a/uds-bundles/dev/cpu/uds-config.yaml
+++ b/uds-bundles/dev/cpu/uds-config.yaml
@@ -1,8 +1,10 @@
 variables:
   text-embeddings:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   whisper:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   supabase:
diff --git a/uds-bundles/dev/gpu/uds-config.yaml b/uds-bundles/dev/gpu/uds-config.yaml
index 9ba6bf636..4f331e1dc 100644
--- a/uds-bundles/dev/gpu/uds-config.yaml
+++ b/uds-bundles/dev/gpu/uds-config.yaml
@@ -1,13 +1,15 @@
 # see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   whisper:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1
+    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
     #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in
 
   supabase:
diff --git a/uds-bundles/latest/cpu/uds-config.yaml b/uds-bundles/latest/cpu/uds-config.yaml
index d610a1972..99e4fb49b 100644
--- a/uds-bundles/latest/cpu/uds-config.yaml
+++ b/uds-bundles/latest/cpu/uds-config.yaml
@@ -1,8 +1,11 @@
+# see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   whisper:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   supabase:
diff --git a/uds-bundles/latest/gpu/uds-config.yaml b/uds-bundles/latest/gpu/uds-config.yaml
index 9ba6bf636..4f331e1dc 100644
--- a/uds-bundles/latest/gpu/uds-config.yaml
+++ b/uds-bundles/latest/gpu/uds-config.yaml
@@ -1,13 +1,15 @@
 # see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   whisper:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1
+    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
     #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in
 
   supabase:
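
Note on the template conditional: Helm parses chart values through a YAML-to-JSON round trip, so numeric values such as the nvidia.com/gpu limit arrive as floats, which appears to be why the Zarf variable is now interpolated unquoted and compared against 0.0. A minimal sketch of the rendered output for text-embeddings/whisper, assuming a hypothetical values file (not part of this patch) that sets the limit to 1:

    # hypothetical values
    gpu:
      runtimeClassName: ""
    resources:
      limits:
        nvidia.com/gpu: 1

    # rendered pod spec excerpt: the limit is > 0, so the first branch wins
    spec:
      runtimeClassName: nvidia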
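
Both the hard-coded nvidia value and the GPU_CLASS_NAME fallback assume a RuntimeClass of that name already exists in the cluster, typically created by the NVIDIA GPU Operator or the NVIDIA Container Toolkit's containerd configuration. For reference, a minimal equivalent manifest:

    apiVersion: node.k8s.io/v1
    kind: RuntimeClass
    metadata:
      name: nvidia
    handler: nvidia  # must match the runtime handler configured on GPU nodes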
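
With these variables wired through, moving a CPU-pinned service onto the GPU becomes a bundle-config change rather than a chart edit. A sketch of the relevant uds-config.yaml fragment for whisper (values illustrative):

    variables:
      whisper:
        gpu_class_name: "nvidia"
        gpu_limit: 1  # any value > 0 also selects runtimeClassName: nvidia via the template conditional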