Skip to content

Commit

Permalink
feat(backend): add k3d gpu image builder (#797)
Browse files Browse the repository at this point in the history
* add k3d gpu Dockerfile and nvidia daemonset
* add k3d gpu image build/release to release pipeline
* add make commands to streamline standing up k3d-gpu uds cluster
  • Loading branch information
gphorvath committed Jul 23, 2024
1 parent 976635c commit 4504085
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 1 deletion.
9 changes: 9 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ jobs:
with:
python-version-file: 'pyproject.toml'

- name: Build and Publish k3d-gpu image
run: |
cd packages/k3d-gpu
docker build \
--platform linux/amd64 \
-t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} .
docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }}
cd ../..
- name: Download Python Wheels and Publish Builder Image
run: |
docker buildx build --platform amd64,arm64 -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${{ steps.get_version.outputs.version-without-v }} --push -f src/leapfrogai_sdk/Dockerfile .
Expand Down
14 changes: 13 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ARCH ?= amd64
KEY ?= ""
REG_PORT ?= 5000
REG_NAME ?= registry

VERSION ?= $(shell git describe --abbrev=0 --tags)
LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
Expand Down Expand Up @@ -33,7 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto

local-registry: ## Start up a local container registry. Errors in this target are ignored.
-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2
@echo "Creating local Docker registry..."
-@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2
@echo "Local registry created at localhost:${REG_PORT}"


# Clean up: Stop and remove the local registry
clean-registry:
@echo "Cleaning up..."
@docker stop ${REG_NAME}
@docker rm ${REG_NAME}

sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .
Expand Down Expand Up @@ -151,3 +161,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
build-all: build-cpu build-gpu ## Build all of the LFAI packages

include tests/make-tests.mk

include packages/k3d-gpu/Makefile
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Builds a k3d-compatible node image with GPU support: overlays the k3s
# distribution onto an nvidia/cuda base image so a k3d cluster started from
# this image can schedule CUDA workloads on the host's NVIDIA GPU(s).
ARG K3S_TAG=v1.28.8-k3s1
ARG CUDA_TAG=12.4.1-base-ubuntu22.04

# First stage only serves as a source for the k3s filesystem/binaries.
FROM rancher/k3s:$K3S_TAG AS k3s
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit
# (adds the NVIDIA apt repo with its signing key, installs the toolkit and
# runtime, then rewrites the containerd config to register the nvidia runtime)
RUN apt-get update && apt-get install -y curl \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
&& apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
&& nvidia-ctk runtime configure --runtime=containerd

# Overlay the k3s root filesystem, keeping the CUDA image's /bin and copying
# k3s's /bin separately so the two can coexist.
# NOTE(review): COPY --exclude requires a BuildKit Dockerfile frontend that
# supports the flag (a labs feature in dockerfile:1.7-labs) — confirm the
# builder used in CI supports it.
COPY --from=k3s / / --exclude=/bin/
COPY --from=k3s /bin /bin

# Deploy the nvidia driver plugin on startup
# (k3s auto-applies any manifest placed in server/manifests)
COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF: resolve fsnotify issues
# NOTE(review): `sysctl -w` in a RUN line only affects the build-time
# container; kernel parameters generally need to be set on the host or at
# container start to take effect at runtime — confirm this has the intended
# effect.
RUN sysctl -w fs.inotify.max_user_watches=100000
RUN sysctl -w fs.inotify.max_user_instances=100000

# k3s expects its bundled auxiliary tools on PATH.
ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
26 changes: 26 additions & 0 deletions packages/k3d-gpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Targets for building the k3d GPU node image and standing up a GPU-enabled
# UDS k3d cluster. This file is included from the repository root Makefile,
# so target names must not collide with root targets.

# Absolute directory of this Makefile, so recipes work from any cwd.
MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

UDS_VERSION := 0.24.1
# ?= so the root Makefile's definition wins when this file is included.
LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)


build-k3d-gpu: ## Build the k3s + CUDA node image, tagged with the current git short SHA
	@cd ${MAKEFILE_DIR} && \
	docker build \
	--platform linux/amd64 \
	-t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION} .

create-uds-gpu-cluster: build-k3d-gpu ## Deploy a UDS k3d cluster that uses the GPU-enabled node image
	@uds deploy k3d-core-slim-dev:${UDS_VERSION} \
	--set K3D_EXTRA_ARGS="--gpus=all \
	--image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION}" --confirm

test-uds-gpu-cluster: ## Run a CUDA vector-add test pod to verify GPU scheduling works
	@cd ${MAKEFILE_DIR} && \
	uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
	@uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
	@uds zarf tools kubectl logs -l app=gpu-pod
	@cd ${MAKEFILE_DIR} && \
	uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml

.PHONY: build-k3d-gpu create-uds-gpu-cluster test-uds-gpu-cluster
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# K3D GPU

Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have access to your host machine's NVIDIA, CUDA-capable GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make build-k3d-gpu # build the image

make create-uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image

make test-uds-gpu-cluster # deploy a test gpu pod to see if everything is working
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Registers the "nvidia" RuntimeClass and deploys the NVIDIA device plugin
# DaemonSet so nvidia.com/gpu resources are advertised to the scheduler.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia # Explicitly request the runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Smoke-test pod: runs NVIDIA's CUDA vector-add sample once to verify that a
# GPU can be scheduled and used in the cluster (requested via the nvidia
# RuntimeClass and one nvidia.com/gpu resource).
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never # run-to-completion test; do not restart on success/failure
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1" # requesting 1 GPU
          cpu: "1"
          memory: 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule

0 comments on commit 4504085

Please sign in to comment.