- https://docs.google.com/document/d/1H-ddA11laPQf_1olwXRjEDbzNihxprjPr74pZ4Vdf2M/edit?usp=sharingmps
- https://docs.nvidia.com/deploy/mps/index.html
- https://docs.google.com/document/d/1xZrrMBJV00VxW9XK8AMXU0QT01JGqn-xhGwexenhPSE/edit#heading=h.jw5js7865egx
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt-get update
lspci | grep -i nvidia
ubuntu-drivers devices
ubuntu-drivers autoinstall
nvidia-smi
/usr/local/bin/k3s-uninstall.sh
/usr/local/bin/k3s-agent-uninstall.sh
cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia
handler: nvidia
EOF
# kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/nvidia-device-plugin.yml
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
spec:
runtimeClassName: nvidia # Added 😈
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
priorityClassName: "system-node-critical"
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.12.2
name: nvidia-device-plugin-ctr
env:
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
EOF
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: nvidia-deployment
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: nvidia
template:
metadata:
labels:
app: nvidia
spec:
runtimeClassName: nvidia
restartPolicy: Always
containers:
- name: nvidia
image: "nvidia/cuda:12.6.1-base-ubuntu22.04"
command: [ "/bin/bash", "-c", "--" ]
args: [ "while true; do sleep 30; done;" ]
resources:
limits:
nvidia.com/gpu: 1
EOF
curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
helm version
# If you have installed the CRDs manually instead of with the `--set installCRDs=true` option added to your Helm install command, you should upgrade your CRD resources before upgrading the Helm chart:
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.7.1/cert-manager.crds.yaml
# Add the Jetstack Helm repository
helm repo add jetstack https://charts.jetstack.io
# Update your local Helm chart repository cache
helm repo update
# Install the cert-manager Helm chart
helm install cert-manager jetstack/cert-manager \
--namespace cert-manager \
--create-namespace \
--version v1.7.1
helm repo add rancher-stable https://releases.rancher.com/server-charts/stable
helm repo update
echo "export KUBECONFIG=/etc/rancher/k3s/k3s.yaml" >> ~/.bash_profile
source .bash_profile
kubectl create namespace cattle-system
helm install rancher rancher-stable/rancher \
--namespace cattle-system \
--set hostname=rancher.x.run \
--set bootstrapPassword=admin
k3s kubectl get pods -A
kubectl describe node