bottlerocket-os · arnaldo2792 · Jan 26, 2022 · Jan 24, 2022 · Nov 24, 2021 · Oct 8, 2021
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -27,30 +27,59 @@ jobs:
         variant: [aws-k8s-1.18, aws-k8s-1.19, aws-k8s-1.20, aws-k8s-1.21, aws-ecs-1]
         arch: [x86_64, aarch64]
         supported: [true]
+        fetch-upstream: ["false"]
         include:
           - variant: aws-dev
             arch: x86_64
             supported: false
+            fetch-upstream: "false"
           - variant: vmware-dev
             arch: x86_64
             supported: false
+            fetch-upstream: "false"
           - variant: metal-dev
             arch: x86_64
             supported: false
+            fetch-upstream: "false"
           - variant: metal-k8s-1.21
             arch: x86_64
             supported: false
+            fetch-upstream: "false"
           - variant: vmware-k8s-1.20
             arch: x86_64
             supported: true
+            fetch-upstream: "false"
           - variant: vmware-k8s-1.21
             arch: x86_64
             supported: true
+            fetch-upstream: "false"
+          - variant: aws-k8s-1.21-nvidia
+            arch: x86_64
+            supported: true
+            fetch-upstream: "true"
+          - variant: aws-k8s-1.21-nvidia
+            arch: aarch64
+            supported: true
+            fetch-upstream: "true"
       fail-fast: false
     steps:
       - uses: actions/checkout@v2
       - run: rustup toolchain install 1.58.0 && rustup default 1.58.0
       - run: cargo install --version 0.35.8 cargo-make
+      - if: contains(matrix.variant, 'nvidia')
+        run: |
+          cat <<-EOF > Licenses.toml
+          [nvidia]
+          spdx-id = "LICENSE-LicenseRef-NVIDIA-Customer"
+          licenses = [
+            { path = "NVIDIA", license-url = "https://www.nvidia.com/en-us/drivers/nvidia-license/" }
+          ]
+          EOF
       - run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} unit-tests
       - run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} check-fmt
-      - run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} -e BUILDSYS_ARCH=${{ matrix.arch }} -e BUILDSYS_JOBS=12
+      - run: |
+          cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} \
+            -e BUILDSYS_ARCH=${{ matrix.arch }} \
+            -e BUILDSYS_JOBS=12 \
+            -e BUILDSYS_UPSTREAM_SOURCE_FALLBACK=${{ matrix.fetch-upstream }} \
+            -e BUILDSYS_UPSTREAM_LICENSES_FETCH=${{ matrix.fetch-upstream }}
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@
 /roles
 /Licenses.toml
 /licenses
+*.run
diff --git a/BUILDING.md b/BUILDING.md
@@ -121,6 +121,34 @@ licenses = [
 ]
 ```
 
+#### NVIDIA variants
+
+If you want to build the `aws-k8s-1.21-nvidia` variant, you can follow these steps to prepare a `Licenses.toml` file using the [License for customer use of NVIDIA software](https://www.nvidia.com/en-us/drivers/nvidia-license/):
+
+1. Create a `Licenses.toml` file in your Bottlerocket root directory, with the following content:
+
+```toml
+[nvidia]
+spdx-id = "LicensesRef-NVIDIA-Customer-Use"
+licenses = [
+  { path = "LICENSE", license-url = "https://www.nvidia.com/en-us/drivers/nvidia-license/" }
+]
+```
+
+2. Fetch the licenses with this command:
+
+```shell
+cargo make -e BUILDSYS_UPSTREAM_LICENSES_FETCH=true fetch-licenses
+```
+
+3. Build your image. If you haven't cached the driver's sources, set the `BUILDSYS_UPSTREAM_SOURCE_FALLBACK` flag to `true`:
+
+```shell
+cargo make \
+  -e BUILDSYS_VARIANT=aws-k8s-1.21-nvidia \
+  -e BUILDSYS_UPSTREAM_SOURCE_FALLBACK="true"
+```
+
 ### Register an AMI
 
 To use the image in Amazon EC2, we need to register the image as an AMI.

diff --git a/Dockerfile b/Dockerfile
@@ -83,6 +83,7 @@ RUN rpmdev-setuptree \
    && cat ${ARCH} shared rust cargo > .rpmmacros \
    && echo "%_cross_variant ${VARIANT}" >> .rpmmacros \
    && echo "%_cross_repo_root_json %{_builddir}/root.json" >> .rpmmacros \
+   && echo "%_topdir /home/builder/rpmbuild" >> .rpmmacros \
    && rm ${ARCH} shared rust cargo \
    && mv *.spec rpmbuild/SPECS \
    && find . -maxdepth 1 -not -path '*/\.*' -type f -exec mv {} rpmbuild/SOURCES/ \; \

diff --git a/QUICKSTART-EKS.md b/QUICKSTART-EKS.md
@@ -369,3 +369,17 @@ Once it launches, you should be able to run pods on your Bottlerocket instance u
 
 For example, to run busybox:
 `kubectl run -i -t busybox --image=busybox --restart=Never`
+
+### aws-k8s-1.21-nvidia variant
+
+The `aws-k8s-1.21-nvidia` variant includes the required packages and configurations to leverage NVIDIA GPUs.
+It comes with the [NVIDIA Tesla driver](https://docs.nvidia.com/datacenter/tesla/drivers/index.html) along with the libraries required by the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) included in your orchestrated containers.
+It also includes the [NVIDIA k8s device plugin](https://github.com/NVIDIA/k8s-device-plugin).
+If you already have a daemonset for the device plugin in your cluster, you may need to use taints and tolerations to keep it from running on Bottlerocket nodes.
+
+Additional NVIDIA tools such as [DCGM](https://github.com/NVIDIA/dcgm-exporter) and [GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) will work as expected.
+You can install them in your cluster by following the `helm install` instructions provided for each project.
+
+The [GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#install-nvidia-gpu-operator) can also be used to install these tools.
+However, it is cumbersome to select the right subset of features to avoid conflicts with the software included in the variant.
+Therefore we recommend installing the tools individually if they are required.
diff --git a/README.md b/README.md
@@ -54,6 +54,7 @@ The following variants support EKS, as described above:
 - `aws-k8s-1.19`
 - `aws-k8s-1.20`
 - `aws-k8s-1.21`
+- `aws-k8s-1.21-nvidia`
 
 The following variant supports ECS:
 

diff --git a/SECURITY_FEATURES.md b/SECURITY_FEATURES.md
@@ -134,6 +134,8 @@ All binaries are linked with the following options:
 
 Together these enable [full RELRO support](https://www.redhat.com/en/blog/hardening-elf-binaries-using-relocation-read-only-relro) which makes [ROP](https://en.wikipedia.org/wiki/Return-oriented_programming) attacks more difficult to execute.
 
+**Note:** Certain variants, such as the ones for NVIDIA, include precompiled binaries that may not have been built with these hardening flags.
+
 ### SELinux enabled in enforcing mode
 
 Bottlerocket enables SELinux by default, sets it to enforcing mode, and loads the policy during boot.

diff --git a/packages/containerd/containerd-config-toml_k8s_nvidia b/packages/containerd/containerd-config-toml_k8s_nvidia
@@ -0,0 +1,39 @@
+version = 2
+root = "/var/lib/containerd"
+state = "/run/containerd"
+disabled_plugins = [
+    "io.containerd.internal.v1.opt",
+    "io.containerd.snapshotter.v1.aufs",
+    "io.containerd.snapshotter.v1.devmapper",
+    "io.containerd.snapshotter.v1.native",
+    "io.containerd.snapshotter.v1.zfs",
+]
+
+[grpc]
+address = "/run/dockershim.sock"
+
+[plugins."io.containerd.grpc.v1.cri"]
+enable_selinux = true
+# Pause container image is specified here, shares the same image as kubelet's pod-infra-container-image
+sandbox_image = "{{settings.kubernetes.pod-infra-container-image}}"
+
+[plugins."io.containerd.grpc.v1.cri".containerd]
+default_runtime_name = "nvidia"
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+runtime_type = "io.containerd.runc.v2"
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
+SystemdCgroup = true
+BinaryName = "nvidia-oci"
+
+[plugins."io.containerd.grpc.v1.cri".cni]
+bin_dir = "/opt/cni/bin"
+conf_dir = "/etc/cni/net.d"
+
+{{#if settings.container-registry.mirrors}}
+{{#each settings.container-registry.mirrors}}
+[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{registry}}"]
+endpoint = [{{join_array ", " endpoint }}]
+{{/each}}
+{{/if}}
diff --git a/packages/containerd/containerd.service b/packages/containerd/containerd.service
@@ -7,6 +7,7 @@ Wants=network-online.target configured.target
 [Service]
 Slice=runtime.slice
 EnvironmentFile=/etc/network/proxy.env
+EnvironmentFile=-/etc/containerd/nvidia.env
 ExecStart=/usr/bin/containerd
 Type=notify
 Delegate=yes

diff --git a/packages/containerd/containerd.spec b/packages/containerd/containerd.spec
@@ -18,6 +18,7 @@ Source0: https://%{goimport}/archive/v%{gover}/%{gorepo}-%{gover}.tar.gz
 Source1: containerd.service
 Source2: containerd-config-toml_k8s
 Source3: containerd-config-toml_basic
+Source4: containerd-config-toml_k8s_nvidia
 Source5: containerd-tmpfiles.conf
 Source1000: clarify.toml
 
@@ -75,7 +76,7 @@ install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir}/containerd.service
 
 install -d %{buildroot}%{_cross_templatedir}
 install -d %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/containerd
-install -p -m 0644 %{S:2} %{S:3} %{buildroot}%{_cross_templatedir}
+install -p -m 0644 %{S:2} %{S:3} %{S:4} %{buildroot}%{_cross_templatedir}
 
 install -d %{buildroot}%{_cross_tmpfilesdir}
 install -p -m 0644 %{S:5} %{buildroot}%{_cross_tmpfilesdir}/containerd.conf

diff --git a/packages/kmod-5.10-nvidia/Cargo.toml b/packages/kmod-5.10-nvidia/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "kmod-5_10-nvidia"
+version = "0.1.0"
+edition = "2018"
+publish = false
+build = "build.rs"
+
+[lib]
+path = "pkg.rs"
+
+[package.metadata.build-package]
+package-name = "kmod-5.10-nvidia"
+
+[[package.metadata.build-package.external-files]]
+url = "https://us.download.nvidia.com/tesla/470.82.01/NVIDIA-Linux-x86_64-470.82.01.run"
+sha512 = "86eac5e2d4fae5525a9332b77da58c0c12e76a35db023a2b14de7d9615b20ba4850a04fa189189c0dcf712f1f343fee98b954aaa6e9b83a959de3c3b8259c7c2"
+
+[[package.metadata.build-package.external-files]]
+url = "https://us.download.nvidia.com/tesla/470.82.01/NVIDIA-Linux-aarch64-470.82.01.run"
+sha512 = "62c4adf6fa3c3474c3a09c08ed8056d4e9d00a90effa3851add10d6b2603c23f9986c32ace2e1b2ed7b735779430d634856a06e93af41431db439dfc79503cd8"
+
+[build-dependencies]
+glibc = { path = "../glibc" }
+kernel-5_10 = { path = "../kernel-5.10" }
diff --git a/packages/kmod-5.10-nvidia/build.rs b/packages/kmod-5.10-nvidia/build.rs
@@ -0,0 +1,9 @@
+use std::process::{exit, Command};
+
+fn main() -> Result<(), std::io::Error> {
+    let status = Command::new("buildsys").arg("build-package").status()?;
+    if status.success() {
+        return Ok(()); // `buildsys build-package` reported success
+    }
+    exit(1) // surface a failed `buildsys` run as exit status 1
+}