Precompile for cuDNN 9.1+ and simplify build scripts #93

Merged · 1 commit · Aug 12, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -24,7 +24,7 @@ only the host CPU. If a matching CUDA version is detected, the target is set to
| --- | --- |
| cpu | |
| tpu | libtpu |
-| cuda12 | CUDA 12.1+, cuDNN 8.9+ and < 9 |
+| cuda12 | CUDA >= 12.1, cuDNN >= 9.1 and < 10.0 |
| cuda | CUDA x.y, cuDNN (building from source only) |
| rocm | ROCm (building from source only) |

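For reference, a quick way to check the locally installed CUDA and cuDNN versions before picking a target — a sketch assuming the Debian/Ubuntu NVIDIA packages; header paths vary by distribution:

```shell
# CUDA toolkit version (nvcc ships with the toolkit)
nvcc --version

# cuDNN version, as recorded in the header installed by the dev package
grep -R "#define CUDNN_MAJOR" /usr/include 2>/dev/null | head -n 1
```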
86 changes: 86 additions & 0 deletions builds/Dockerfile
@@ -0,0 +1,86 @@
ARG VARIANT
ARG BASE_IMAGE="hexpm/elixir:1.17.2-erlang-27.0.1-ubuntu-focal-20240530"

# Pre-stages for base image variants

FROM ${BASE_IMAGE} AS base-cpu

FROM ${BASE_IMAGE} AS base-cuda

ARG CUDA_VERSION
ARG CUDNN_VERSION

ARG DEBIAN_FRONTEND=noninteractive

RUN distro="ubuntu$(. /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d '.')" && \
# Official Docker images use the sbsa packages when targeting arm64.
# See https://gitlab.com/nvidia/container-images/cuda/-/blob/85f465ea3343a2d7f7753a0a838701999ed58a01/dist/12.5.1/ubuntu2204/base/Dockerfile#L12
arch="$(if [ "$(uname -m)" = "aarch64" ]; then echo "sbsa"; else echo "x86_64"; fi)" && \
apt-get update && apt-get install -y ca-certificates wget && \
wget -qO /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.1-1_all.deb && \
dpkg -i /tmp/cuda-keyring.deb && apt-get update && \
apt-get install -y git cuda-toolkit-${CUDA_VERSION} libcudnn9-cuda-12=${CUDNN_VERSION}-1 libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*
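# Pinning libcudnn9-cuda-12 to an exact CUDNN_VERSION keeps rebuilds of the
# precompiled archives reproducible rather than floating to the newest cuDNN.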

FROM ${BASE_IMAGE} AS base-rocm

ARG ROCM_VERSION

ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates curl gnupg && \
distro="$(. /etc/lsb-release && echo "$DISTRIB_CODENAME")" && \
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ $distro main" | tee /etc/apt/sources.list.d/rocm.list && \
printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' | tee /etc/apt/preferences.d/rocm-pin-600 && \
apt-get update && \
apt-get install -y rocm-dev rocm-libs && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

ENV ROCM_PATH "/opt/rocm-${ROCM_VERSION}.0"

FROM base-${VARIANT}
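# The VARIANT build arg selects which of the pre-stages above (base-cpu,
# base-cuda, or base-rocm) the final stage extends, so this single Dockerfile
# replaces the previous per-target Dockerfiles.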

# Set the missing UTF-8 locale, otherwise Elixir warns
ENV LC_ALL C.UTF-8

# Make sure installing packages (like tzdata) doesn't prompt for configuration
ARG DEBIAN_FRONTEND=noninteractive

# We need to install "add-apt-repository" first
RUN apt-get update && apt-get install -y software-properties-common && \
# Add repository with the latest git version
add-apt-repository ppa:git-core/ppa && \
# Install basic system dependencies
apt-get update && apt-get install -y ca-certificates curl git unzip wget && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Install Bazel using Bazelisk (works for both amd and arm)
RUN wget -O bazel "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-$(dpkg --print-architecture)" && \
chmod +x bazel && \
mv bazel /usr/local/bin/bazel

ENV USE_BAZEL_VERSION 6.5.0
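# Bazelisk reads USE_BAZEL_VERSION and fetches that exact Bazel release on first run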

# Install Python and the necessary global dependencies
RUN apt-get update && apt-get install -y python3 python3-pip && \
ln -s /usr/bin/python3 /usr/bin/python && \
python -m pip install --upgrade pip numpy && \
apt-get clean -y && rm -rf /var/lib/apt/lists/*

# Setup project files

ARG XLA_TARGET

ENV XLA_TARGET=${XLA_TARGET}
ENV XLA_CACHE_DIR=/build
ENV XLA_BUILD=true
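# XLA_BUILD=true forces the xla package to compile from source; the resulting
# archive is cached in XLA_CACHE_DIR, which build.sh bind-mounts to the host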

COPY mix.exs mix.lock ./
RUN mix deps.get

COPY lib lib
COPY Makefile ./
COPY extension extension

CMD [ "mix", "compile" ]
2 changes: 1 addition & 1 deletion builds/README.md
@@ -4,7 +4,7 @@ This directory contains Docker-based automated builds to run off-CI.

## Usage

-Run the build script, passing one of the defined variants.
+Run the build script, passing one of the defined targets.

```shell
./build.sh cuda12
```
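A successful run leaves the compiled archive under `builds/output/<target>/build` on the host, since `build.sh` bind-mounts that directory over the container's `XLA_CACHE_DIR` (`/build`). For example, after the command above:

```shell
ls builds/output/cuda12/build
```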
58 changes: 21 additions & 37 deletions builds/build.sh
@@ -5,73 +5,57 @@ set -e
cd "$(dirname "$0")/.."

print_usage_and_exit() {
-  echo "Usage: $0 <variant>"
+  echo "Usage: $0 <target>"
  echo ""
-  echo "Compiles the project inside docker. Available variants: cpu, cuda12, tpu, rocm."
+  echo "Compiles the project inside docker. Available targets: cpu, cuda12, tpu, rocm."
  exit 1
}

if [ $# -ne 1 ]; then
print_usage_and_exit
fi

+# For cuDNN support matrix see [1]. When precompiling, we want to use
+# the lowest cuDNN that supports the given CUDA version.
+#
+# [1]: https://docs.nvidia.com/deeplearning/cudnn/archives/index.html
+target="$1"

-case "$1" in
+case "$target" in
"cpu")
docker build -t xla-cpu -f builds/cpu.Dockerfile \
docker build -t xla-cpu -f builds/Dockerfile \
--build-arg VARIANT=cuda \
--build-arg XLA_TARGET=cpu \
.

docker run --rm \
-v $(pwd)/builds/output/cpu/build:/build \
-v $(pwd)/builds/output/cpu/.cache:/root/.cache \
$XLA_DOCKER_FLAGS \
xla-cpu
;;

"tpu")
docker build -t xla-tpu -f builds/cpu.Dockerfile \
docker build -t xla-tpu -f builds/Dockerfile \
--build-arg VARIANT=cpu \
--build-arg XLA_TARGET=tpu \
.

docker run --rm \
-v $(pwd)/builds/output/tpu/build:/build \
-v $(pwd)/builds/output/tpu/.cache:/root/.cache \
$XLA_DOCKER_FLAGS \
xla-tpu
;;

"cuda12")
docker build -t xla-cuda12 -f builds/cuda.Dockerfile \
--build-arg CUDA_VERSION=12.1.0 \
--build-arg CUDNN_VERSION=8.9.0 \
docker build -t xla-cuda12 -f builds/Dockerfile \
--build-arg VARIANT=cuda \
--build-arg CUDA_VERSION=12-3 \
--build-arg CUDNN_VERSION=9.1.1.17 \
--build-arg XLA_TARGET=cuda12 \
.

docker run --rm \
-v $(pwd)/builds/output/cuda12/build:/build \
-v $(pwd)/builds/output/cuda12/.cache:/root/.cache \
$XLA_DOCKER_FLAGS \
xla-cuda12
;;

"rocm")
docker build -t xla-rocm -f builds/rocm.Dockerfile \
docker build -t xla-rocm -f builds/Dockerfile \
--build-arg VARIANT=rocm \
--build-arg ROCM_VERSION=6.0 \
--build-arg XLA_TARGET=rocm \
.

docker run --rm \
-v $(pwd)/builds/output/rocm/build:/build \
-v $(pwd)/builds/output/rocm/.cache:/root/.cache \
$XLA_DOCKER_FLAGS \
xla-rocm
;;

  *)
    print_usage_and_exit
    ;;
esac

+docker run --rm \
+  -v $(pwd)/builds/output/$target/build:/build \
+  -v $(pwd)/builds/output/$target/.cache:/root/.cache \
+  $XLA_DOCKER_FLAGS \
+  xla-$target
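Because `$XLA_DOCKER_FLAGS` is expanded unquoted into the final `docker run`, extra runtime flags can be passed through the environment. A hypothetical invocation — the flag values are illustrative, not part of this PR:

```shell
# Constrain the build container; any valid `docker run` flags can go here.
XLA_DOCKER_FLAGS="--cpus 8 --memory 16g" ./build.sh cuda12
```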
43 changes: 0 additions & 43 deletions builds/cpu.Dockerfile

This file was deleted.

66 changes: 0 additions & 66 deletions builds/cuda.Dockerfile

This file was deleted.

60 changes: 0 additions & 60 deletions builds/rocm.Dockerfile

This file was deleted.