Skip to content

Commit

Permalink
feat(backend): add k3d gpu image builder (#797)
Browse files Browse the repository at this point in the history
* add k3d gpu Dockerfile and nvidia daemonset
* add k3d gpu image build/release to release pipeline
* add make commands to streamline standing up k3d-gpu uds cluster
  • Loading branch information
gphorvath committed Jul 23, 2024
1 parent 976635c commit 4504085
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 1 deletion.
9 changes: 9 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ jobs:
with:
python-version-file: 'pyproject.toml'

- name: Build and Publish k3d-gpu image
run: |
cd packages/k3d-gpu
docker build \
--platform linux/amd64 \
-t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} .
docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }}
cd ../..
- name: Download Python Wheels and Publish Builder Image
run: |
docker buildx build --platform amd64,arm64 -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${{ steps.get_version.outputs.version-without-v }} --push -f src/leapfrogai_sdk/Dockerfile .
Expand Down
14 changes: 13 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ARCH ?= amd64
KEY ?= ""
REG_PORT ?= 5000
REG_NAME ?= registry

VERSION ?= $(shell git describe --abbrev=0 --tags)
LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
Expand Down Expand Up @@ -33,7 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto

local-registry: ## Start up a local container registry. Errors in this target are ignored.
-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2
@echo "Creating local Docker registry..."
-@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2
@echo "Local registry created at localhost:${REG_PORT}"


# Clean up: Stop and remove the local registry
clean-registry:
@echo "Cleaning up..."
@docker stop ${REG_NAME}
@docker rm ${REG_NAME}

sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .
Expand Down Expand Up @@ -151,3 +161,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
build-all: build-cpu build-gpu ## Build all of the LFAI packages

include tests/make-tests.mk

include packages/k3d-gpu/Makefile
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Builds a k3d-compatible node image with GPU support: overlays the k3s
# distribution onto an nvidia/cuda base image so a k3d cluster started from
# this image can schedule CUDA workloads on the host's NVIDIA GPU(s).
ARG K3S_TAG=v1.28.8-k3s1
ARG CUDA_TAG=12.4.1-base-ubuntu22.04

# First stage only serves as a source for the k3s filesystem/binaries.
FROM rancher/k3s:$K3S_TAG AS k3s
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit
# (adds the NVIDIA apt repo with its signing key, installs the toolkit and
# runtime, then rewrites the containerd config to register the nvidia runtime)
RUN apt-get update && apt-get install -y curl \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
&& apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
&& nvidia-ctk runtime configure --runtime=containerd

# Overlay the k3s root filesystem, keeping the CUDA image's /bin and copying
# k3s's /bin separately so the two can coexist.
# NOTE(review): COPY --exclude requires a BuildKit Dockerfile frontend that
# supports the flag (a labs feature in dockerfile:1.7-labs) — confirm the
# builder used in CI supports it.
COPY --from=k3s / / --exclude=/bin/
COPY --from=k3s /bin /bin

# Deploy the nvidia driver plugin on startup
# (k3s auto-applies any manifest placed in server/manifests)
COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF: resolve fsnotify issues
# NOTE(review): `sysctl -w` in a RUN line only affects the build-time
# container; kernel parameters generally need to be set on the host or at
# container start to take effect at runtime — confirm this has the intended
# effect.
RUN sysctl -w fs.inotify.max_user_watches=100000
RUN sysctl -w fs.inotify.max_user_instances=100000

# k3s expects its bundled auxiliary tools on PATH.
ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
26 changes: 26 additions & 0 deletions packages/k3d-gpu/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Targets for building the k3d GPU node image and standing up a GPU-enabled
# UDS k3d cluster. This file is included from the repository root Makefile,
# so target names must not collide with root targets.

# Absolute directory of this Makefile, so recipes work from any cwd.
MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

UDS_VERSION := 0.24.1
# ?= so the root Makefile's definition wins when this file is included.
LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)


build-k3d-gpu: ## Build the k3s + CUDA node image, tagged with the current git short SHA
	@cd ${MAKEFILE_DIR} && \
	docker build \
	--platform linux/amd64 \
	-t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION} .

create-uds-gpu-cluster: build-k3d-gpu ## Deploy a UDS k3d cluster that uses the GPU-enabled node image
	@uds deploy k3d-core-slim-dev:${UDS_VERSION} \
	--set K3D_EXTRA_ARGS="--gpus=all \
	--image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION}" --confirm

test-uds-gpu-cluster: ## Run a CUDA vector-add test pod to verify GPU scheduling works
	@cd ${MAKEFILE_DIR} && \
	uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
	@uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
	@uds zarf tools kubectl logs -l app=gpu-pod
	@cd ${MAKEFILE_DIR} && \
	uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml

.PHONY: build-k3d-gpu create-uds-gpu-cluster test-uds-gpu-cluster
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# K3D GPU

Prepares `k3s` + `nvidia/cuda` base image that enables a K3D cluster to have access to your host machine's NVIDIA, CUDA-capable GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make build-k3d-gpu # build the image

make create-uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image

make test-uds-gpu-cluster # deploy a test gpu pod to see if everything is working
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Registers the "nvidia" RuntimeClass and deploys the NVIDIA device plugin
# DaemonSet so nvidia.com/gpu resources are advertised to the scheduler.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia # Explicitly request the runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Smoke-test pod: runs NVIDIA's CUDA vector-add sample once to verify that a
# GPU can be scheduled and used in the cluster (requested via the nvidia
# RuntimeClass and one nvidia.com/gpu resource).
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never # run-to-completion test; do not restart on success/failure
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1" # requesting 1 GPU
          cpu: "1"
          memory: 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule

0 comments on commit 4504085

Please sign in to comment.