feat(backend): nvidia runtimeclass (#787)
* adds nvidia runtimeClassName to text-embeddings, vllm, and whisper
* adds configuration via Zarf/UDS
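
For context: the nvidia runtimeClassName these charts now set must exist as a RuntimeClass resource in the cluster, typically created by the NVIDIA GPU Operator or container toolkit install. A minimal sketch of that resource, shown only for orientation (it is not part of this commit):

    apiVersion: node.k8s.io/v1
    kind: RuntimeClass
    metadata:
      name: nvidia   # the name the charts reference via runtimeClassName
    handler: nvidia  # containerd handler provided by the NVIDIA container toolkit
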
gphorvath committed Jul 26, 2024
1 parent fd1e3dd commit 106997d
Showing 13 changed files with 46 additions and 9 deletions.
5 changes: 5 additions & 0 deletions packages/text-embeddings/chart/templates/deployment.yaml
@@ -25,6 +25,11 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      {{- if gt (index .Values.resources.limits "nvidia.com/gpu") 0.0 }}
+      runtimeClassName: nvidia
+      {{- else if .Values.gpu.runtimeClassName }}
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
+      {{- end }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
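
A sketch of how the conditional above renders, assuming the nvidia.com/gpu limit arrives as the number 1 (hypothetical helm template output; the whisper chart below gains an identical block):

    spec:
      runtimeClassName: nvidia  # first branch: nvidia.com/gpu limit > 0
      securityContext:
        ...

With a limit of 0, the else-if branch emits .Values.gpu.runtimeClassName when it is set; when both are empty, no runtimeClassName line is rendered at all.
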
5 changes: 4 additions & 1 deletion packages/text-embeddings/embedding-values.yaml
@@ -1,6 +1,9 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
 
+gpu:
+  runtimeClassName: "###ZARF_VAR_GPU_CLASS_NAME###"
+
 resources:
   limits:
-    nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###"
+    nvidia.com/gpu: ###ZARF_VAR_GPU_LIMIT###
4 changes: 4 additions & 0 deletions packages/text-embeddings/zarf.yaml
@@ -16,6 +16,10 @@ variables:
     description: The GPU limit for the model inferencing.
     default: "0"
     pattern: "^[0-9]+$"
+  - name: GPU_CLASS_NAME
+    description: The GPU class name for the model inferencing. Leave blank for CPU-only.
+    default: ""
+    pattern: "^(nvidia)?$"
 
 components:
   - name: text-embeddings-model
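
Taken together: at deploy time Zarf substitutes these variables into embedding-values.yaml above. A sketch of the result, assuming GPU_CLASS_NAME=nvidia and the default GPU_LIMIT=0 (hypothetical rendered values):

    gpu:
      runtimeClassName: "nvidia"

    resources:
      limits:
        nvidia.com/gpu: 0  # deliberately unquoted so Helm sees a number

This is also why the quotes around the GPU limit were dropped in the values file: Helm parses unquoted YAML numbers as floats (hence the 0.0 literal in the template's gt check), whereas a quoted value would arrive as a string and fail the numeric comparison.
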
1 change: 1 addition & 0 deletions packages/vllm/chart/templates/deployment.yaml
@@ -25,6 +25,7 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
3 changes: 3 additions & 0 deletions packages/vllm/vllm-values.yaml
@@ -1,2 +1,5 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
+
+gpu:
+  runtimeClassName: nvidia
2 changes: 1 addition & 1 deletion packages/whisper/Dockerfile
@@ -26,7 +26,7 @@ RUN pip uninstall -y ctranslate2 transformers[torch]
 RUN pip install packages/whisper/build/lfai_whisper*.whl --no-index --find-links=packages/whisper/build/
 
 # Use hardened ffmpeg image to get compiled binaries
-FROM cgr.dev/chainguard/ffmpeg:latest as ffmpeg
+FROM cgr.dev/chainguard/ffmpeg:latest AS ffmpeg
 
 # hardened and slim python image
 FROM ghcr.io/defenseunicorns/leapfrogai/python:3.11
5 changes: 5 additions & 0 deletions packages/whisper/chart/templates/deployment.yaml
@@ -25,6 +25,11 @@ spec:
       labels:
         {{- include "chart.selectorLabels" . | nindent 8 }}
     spec:
+      {{- if gt (index .Values.resources.limits "nvidia.com/gpu") 0.0 }}
+      runtimeClassName: nvidia
+      {{- else if .Values.gpu.runtimeClassName }}
+      runtimeClassName: {{ .Values.gpu.runtimeClassName }}
+      {{- end }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       containers:
5 changes: 4 additions & 1 deletion packages/whisper/whisper-values.yaml
@@ -1,6 +1,9 @@
 image:
   tag: "###ZARF_CONST_IMAGE_VERSION###"
 
+gpu:
+  runtimeClassName: "###ZARF_VAR_GPU_CLASS_NAME###"
+
 resources:
   limits:
-    nvidia.com/gpu: "###ZARF_VAR_GPU_LIMIT###"
+    nvidia.com/gpu: ###ZARF_VAR_GPU_LIMIT###
4 changes: 4 additions & 0 deletions packages/whisper/zarf.yaml
@@ -16,6 +16,10 @@ variables:
     description: The GPU limit for the model inferencing.
     default: "0"
     pattern: "^[0-9]+$"
+  - name: GPU_CLASS_NAME
+    description: The GPU class name for the model inferencing. Leave blank for CPU-only.
+    default: ""
+    pattern: "^(nvidia)?$"
 
 components:
   - name: whisper-model
2 changes: 2 additions & 0 deletions uds-bundles/dev/cpu/uds-config.yaml
@@ -1,8 +1,10 @@
 variables:
   text-embeddings:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   whisper:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   supabase:
8 changes: 5 additions & 3 deletions uds-bundles/dev/gpu/uds-config.yaml
@@ -1,13 +1,15 @@
 # see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   whisper:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1
+    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
     #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in
 
   supabase:
3 changes: 3 additions & 0 deletions uds-bundles/latest/cpu/uds-config.yaml
@@ -1,8 +1,11 @@
+# see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   whisper:
+    gpu_class_name: "" # Leave blank if nvidia runtimeClass is not present in cluster
     gpu_limit: 0
 
   supabase:
8 changes: 5 additions & 3 deletions uds-bundles/latest/gpu/uds-config.yaml
@@ -1,13 +1,15 @@
 # see individual zarf packaging configuration for more variables and variable descriptions
 variables:
   text-embeddings:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   whisper:
-    gpu_limit: 0
+    gpu_class_name: "nvidia" # Set to ensure the nvidia runtimeClass is present in case GPU limit is increased
+    gpu_limit: 0 # runs on CPU until GPU limit is increased
 
   vllm:
-    gpu_limit: 1
+    gpu_limit: 1 # if <1, vllm won't work, VLLM is GPU only
     #tensor_parallel_size: 1 # TODO: reintroduce when vllm changes get pulled in
 
   supabase:
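
Finally, a hypothetical operator workflow implied by the comments above: to move whisper onto the GPU after an initial deploy of the gpu bundle, raise its limit in uds-config.yaml and redeploy; the chart's conditional then selects the nvidia runtimeClass automatically:

    variables:
      whisper:
        gpu_class_name: "nvidia"
        gpu_limit: 1  # > 0, so the deployment renders runtimeClassName: nvidia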
