Skip to content

Commit

Permalink
Merge pull request #1799 from arnaldo2792/nvidia-variant
Browse files Browse the repository at this point in the history
Add aws-k8s-1.21-nvidia variant
  • Loading branch information
arnaldo2792 authored Jan 26, 2022
2 parents 3cbdcbd + 54415cb commit 8772387
Show file tree
Hide file tree
Showing 38 changed files with 558 additions and 3 deletions.
31 changes: 30 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,59 @@ jobs:
variant: [aws-k8s-1.18, aws-k8s-1.19, aws-k8s-1.20, aws-k8s-1.21, aws-ecs-1]
arch: [x86_64, aarch64]
supported: [true]
fetch-upstream: ["false"]
include:
- variant: aws-dev
arch: x86_64
supported: false
fetch-upstream: "false"
- variant: vmware-dev
arch: x86_64
supported: false
fetch-upstream: "false"
- variant: metal-dev
arch: x86_64
supported: false
fetch-upstream: "false"
- variant: metal-k8s-1.21
arch: x86_64
supported: false
fetch-upstream: "false"
- variant: vmware-k8s-1.20
arch: x86_64
supported: true
fetch-upstream: "false"
- variant: vmware-k8s-1.21
arch: x86_64
supported: true
fetch-upstream: "false"
- variant: aws-k8s-1.21-nvidia
arch: x86_64
supported: true
fetch-upstream: "true"
- variant: aws-k8s-1.21-nvidia
arch: aarch64
supported: true
fetch-upstream: "true"
fail-fast: false
steps:
- uses: actions/checkout@v2
- run: rustup toolchain install 1.58.1 && rustup default 1.58.1
- run: cargo install --version 0.35.8 cargo-make
- if: contains(matrix.variant, 'nvidia')
run: |
cat <<-EOF > Licenses.toml
[nvidia]
spdx-id = "LICENSE-LicenseRef-NVIDIA-Customer"
licenses = [
{ path = "NVIDIA", license-url = "https://www.nvidia.com/en-us/drivers/nvidia-license/" }
]
EOF
- run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} unit-tests
- run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} check-fmt
- run: cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} -e BUILDSYS_ARCH=${{ matrix.arch }} -e BUILDSYS_JOBS=12
# Build the image for this matrix entry. The matrix's `fetch-upstream` value is
# passed to both the upstream source fallback and the upstream licenses fetch
# settings, so variants that set it to "true" (the nvidia ones) may download from upstream.
- run: |
    cargo make -e BUILDSYS_VARIANT=${{ matrix.variant }} \
      -e BUILDSYS_ARCH=${{ matrix.arch }} \
      -e BUILDSYS_JOBS=12 \
      -e BUILDSYS_UPSTREAM_SOURCE_FALLBACK=${{ matrix.fetch-upstream }} \
      -e BUILDSYS_UPSTREAM_LICENSES_FETCH=${{ matrix.fetch-upstream }}
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
/roles
/Licenses.toml
/licenses
*.run
28 changes: 28 additions & 0 deletions BUILDING.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,34 @@ licenses = [
]
```

#### NVIDIA variants

If you want to build the `aws-k8s-1.21-nvidia` variant, you can follow these steps to prepare a `Licenses.toml` file using the [License for customer use of NVIDIA software](https://www.nvidia.com/en-us/drivers/nvidia-license/):

1. Create a `Licenses.toml` file in your Bottlerocket root directory, with the following content:

```toml
[nvidia]
spdx-id = "LicensesRef-NVIDIA-Customer-Use"
licenses = [
{ path = "LICENSE", license-url = "https://www.nvidia.com/en-us/drivers/nvidia-license/" }
]
```

2. Fetch the licenses with this command:

```shell
cargo make -e BUILDSYS_UPSTREAM_LICENSES_FETCH=true fetch-licenses
```

3. Build your image, setting the `BUILDSYS_UPSTREAM_SOURCE_FALLBACK` flag to `true`, if you haven't cached the driver's sources:

```shell
cargo make \
-e BUILDSYS_VARIANT=aws-k8s-1.21-nvidia \
-e BUILDSYS_UPSTREAM_SOURCE_FALLBACK="true"
```

### Register an AMI

To use the image in Amazon EC2, we need to register the image as an AMI.
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ RUN rpmdev-setuptree \
&& cat ${ARCH} shared rust cargo > .rpmmacros \
&& echo "%_cross_variant ${VARIANT}" >> .rpmmacros \
&& echo "%_cross_repo_root_json %{_builddir}/root.json" >> .rpmmacros \
&& echo "%_topdir /home/builder/rpmbuild" >> .rpmmacros \
&& rm ${ARCH} shared rust cargo \
&& mv *.spec rpmbuild/SPECS \
&& find . -maxdepth 1 -not -path '*/\.*' -type f -exec mv {} rpmbuild/SOURCES/ \; \
Expand Down
14 changes: 14 additions & 0 deletions QUICKSTART-EKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,3 +369,17 @@ Once it launches, you should be able to run pods on your Bottlerocket instance u

For example, to run busybox:
`kubectl run -i -t busybox --image=busybox --restart=Never`

### aws-k8s-1.21-nvidia variant

The `aws-k8s-1.21-nvidia` variant includes the required packages and configurations to leverage NVIDIA GPUs.
It comes with the [NVIDIA Tesla driver](https://docs.nvidia.com/datacenter/tesla/drivers/index.html) along with the libraries required by the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) included in your orchestrated containers.
It also includes the [NVIDIA k8s device plugin](https://github.com/NVIDIA/k8s-device-plugin).
If you already have a daemonset for the device plugin in your cluster, you may need to use taints and tolerations to keep it from running on Bottlerocket nodes.

Additional NVIDIA tools such as [DCGM](https://github.com/NVIDIA/dcgm-exporter) and [GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) will work as expected.
You can install them in your cluster by following the `helm install` instructions provided for each project.

The [GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#install-nvidia-gpu-operator) can also be used to install these tools.
However, it is cumbersome to select the right subset of features to avoid conflicts with the software included in the variant.
Therefore we recommend installing the tools individually if they are required.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ The following variants support EKS, as described above:
- `aws-k8s-1.19`
- `aws-k8s-1.20`
- `aws-k8s-1.21`
- `aws-k8s-1.21-nvidia`

The following variant supports ECS:

Expand Down
2 changes: 2 additions & 0 deletions SECURITY_FEATURES.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ All binaries are linked with the following options:

Together these enable [full RELRO support](https://www.redhat.com/en/blog/hardening-elf-binaries-using-relocation-read-only-relro) which makes [ROP](https://en.wikipedia.org/wiki/Return-oriented_programming) attacks more difficult to execute.

**Note:** Certain variants, such as the ones for NVIDIA, include precompiled binaries that may not have been built with these hardening flags.

### SELinux enabled in enforcing mode

Bottlerocket enables SELinux by default, sets it to enforcing mode, and loads the policy during boot.
Expand Down
39 changes: 39 additions & 0 deletions packages/containerd/containerd-config-toml_k8s_nvidia
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# containerd configuration template that registers an "nvidia" runtime and
# makes it the default for the CRI plugin.
# The {{...}} expressions are template placeholders filled in from settings.
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
# Plugins that are turned off in this configuration.
disabled_plugins = [
"io.containerd.internal.v1.opt",
"io.containerd.snapshotter.v1.aufs",
"io.containerd.snapshotter.v1.devmapper",
"io.containerd.snapshotter.v1.native",
"io.containerd.snapshotter.v1.zfs",
]

# Socket path for containerd's gRPC endpoint.
[grpc]
address = "/run/dockershim.sock"

[plugins."io.containerd.grpc.v1.cri"]
enable_selinux = true
# Pause container image is specified here, shares the same image as kubelet's pod-infra-container-image
sandbox_image = "{{settings.kubernetes.pod-infra-container-image}}"

# Select the "nvidia" runtime (defined below) as the default runtime.
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
runtime_type = "io.containerd.runc.v2"

# Runtime options: systemd cgroups enabled, and `nvidia-oci` named as the runtime binary.
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
SystemdCgroup = true
BinaryName = "nvidia-oci"

[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"

# When registry mirrors are configured, emit one mirror table per registry
# with its list of endpoints.
{{#if settings.container-registry.mirrors}}
{{#each settings.container-registry.mirrors}}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{registry}}"]
endpoint = [{{join_array ", " endpoint }}]
{{/each}}
{{/if}}
1 change: 1 addition & 0 deletions packages/containerd/containerd.service
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Wants=network-online.target configured.target
[Service]
Slice=runtime.slice
EnvironmentFile=/etc/network/proxy.env
EnvironmentFile=-/etc/containerd/nvidia.env
ExecStart=/usr/bin/containerd
Type=notify
Delegate=yes
Expand Down
3 changes: 2 additions & 1 deletion packages/containerd/containerd.spec
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Source0: https://%{goimport}/archive/v%{gover}/%{gorepo}-%{gover}.tar.gz
Source1: containerd.service
Source2: containerd-config-toml_k8s
Source3: containerd-config-toml_basic
Source4: containerd-config-toml_k8s_nvidia
Source5: containerd-tmpfiles.conf
Source1000: clarify.toml

Expand Down Expand Up @@ -75,7 +76,7 @@ install -p -m 0644 %{S:1} %{buildroot}%{_cross_unitdir}/containerd.service

install -d %{buildroot}%{_cross_templatedir}
install -d %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/containerd
install -p -m 0644 %{S:2} %{S:3} %{buildroot}%{_cross_templatedir}
install -p -m 0644 %{S:2} %{S:3} %{S:4} %{buildroot}%{_cross_templatedir}

install -d %{buildroot}%{_cross_tmpfilesdir}
install -p -m 0644 %{S:5} %{buildroot}%{_cross_tmpfilesdir}/containerd.conf
Expand Down
24 changes: 24 additions & 0 deletions packages/kmod-5.10-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Cargo manifest for the kmod-5.10-nvidia package.
# The crate is built through its build script (build.rs); pkg.rs is the library target.
[package]
name = "kmod-5_10-nvidia"
version = "0.1.0"
edition = "2018"
publish = false
build = "build.rs"

[lib]
path = "pkg.rs"

# Package name as declared for the build-package metadata (uses "5.10" rather than "5_10").
[package.metadata.build-package]
package-name = "kmod-5.10-nvidia"

# NVIDIA 470.82.01 `.run` file for x86_64, with its expected SHA-512 digest.
[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.82.01/NVIDIA-Linux-x86_64-470.82.01.run"
sha512 = "86eac5e2d4fae5525a9332b77da58c0c12e76a35db023a2b14de7d9615b20ba4850a04fa189189c0dcf712f1f343fee98b954aaa6e9b83a959de3c3b8259c7c2"

# The same driver version for aarch64, with its expected SHA-512 digest.
[[package.metadata.build-package.external-files]]
url = "https://us.download.nvidia.com/tesla/470.82.01/NVIDIA-Linux-aarch64-470.82.01.run"
sha512 = "62c4adf6fa3c3474c3a09c08ed8056d4e9d00a90effa3851add10d6b2603c23f9986c32ace2e1b2ed7b735779430d634856a06e93af41431db439dfc79503cd8"

# Sibling packages declared as build dependencies of this one.
[build-dependencies]
glibc = { path = "../glibc" }
kernel-5_10 = { path = "../kernel-5.10" }
9 changes: 9 additions & 0 deletions packages/kmod-5.10-nvidia/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
use std::process::{exit, Command};

/// Build-script entry point: hands the package build off to `buildsys`.
///
/// Runs `buildsys build-package` and waits for it to finish.
///
/// # Errors
///
/// Returns the underlying I/O error if the `buildsys` process cannot be
/// started or waited on. If the process runs but does not report success,
/// this process terminates with exit code 1 instead of returning.
fn main() -> Result<(), std::io::Error> {
    let mut buildsys = Command::new("buildsys");
    buildsys.arg("build-package");

    let status = buildsys.status()?;
    if status.success() {
        Ok(())
    } else {
        // Signal the failure to the caller through the exit code.
        exit(1)
    }
}
Loading

0 comments on commit 8772387

Please sign in to comment.