From c215f0e3f032afe653930e198c42a8a9b8b2473d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= Date: Tue, 13 Aug 2024 01:56:08 +0900 Subject: [PATCH] Precompile for cuDNN 9.1+ and simplify build scripts --- README.md | 2 +- builds/Dockerfile | 86 ++++++++++++++++++++++++++++++++++++++++++ builds/README.md | 2 +- builds/build.sh | 58 +++++++++++----------------- builds/cpu.Dockerfile | 43 --------------------- builds/cuda.Dockerfile | 66 -------------------------------- builds/rocm.Dockerfile | 60 ----------------------------- 7 files changed, 109 insertions(+), 208 deletions(-) create mode 100644 builds/Dockerfile delete mode 100644 builds/cpu.Dockerfile delete mode 100644 builds/cuda.Dockerfile delete mode 100644 builds/rocm.Dockerfile diff --git a/README.md b/README.md index b943681..709a1ef 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ only the host CPU. If a matching CUDA version is detected, the target is set to | --- | --- | | cpu | | | tpu | libtpu | -| cuda12 | CUDA 12.1+, cuDNN 8.9+ and < 9 | +| cuda12 | CUDA >= 12.1, cuDNN >= 9.1 and < 10.0 | | cuda | CUDA x.y, cuDNN (building from source only) | | rocm | ROCm (building from source only) | diff --git a/builds/Dockerfile b/builds/Dockerfile new file mode 100644 index 0000000..3e63736 --- /dev/null +++ b/builds/Dockerfile @@ -0,0 +1,86 @@ +ARG VARIANT +ARG BASE_IMAGE="hexpm/elixir:1.17.2-erlang-27.0.1-ubuntu-focal-20240530" + +# Pre-stages for base image variants + +FROM ${BASE_IMAGE} AS base-cpu + +FROM ${BASE_IMAGE} AS base-cuda + +ARG CUDA_VERSION +ARG CUDNN_VERSION + +ARG DEBIAN_FRONTEND=noninteractive + +RUN distro="ubuntu$(. /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d '.')" && \ + # Official Docker images use the sbsa packages when targeting arm64. 
+ # See https://gitlab.com/nvidia/container-images/cuda/-/blob/85f465ea3343a2d7f7753a0a838701999ed58a01/dist/12.5.1/ubuntu2204/base/Dockerfile#L12 + arch="$(if [ "$(uname -m)" = "aarch64" ]; then echo "sbsa"; else echo "x86_64"; fi)" && \ + apt-get update && apt-get install -y ca-certificates wget && \ + wget -qO /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/cuda-keyring_1.1-1_all.deb && \ + dpkg -i /tmp/cuda-keyring.deb && apt-get update && \ + apt-get install -y git cuda-toolkit-${CUDA_VERSION} libcudnn9-cuda-12=${CUDNN_VERSION}-1 libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 && \ + apt-get clean -y && rm -rf /var/lib/apt/lists/* + +FROM ${BASE_IMAGE} AS base-rocm + +ARG ROCM_VERSION + +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates curl gnupg && \ + distro="$(. /etc/lsb-release && echo "$DISTRIB_CODENAME")" && \ + curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \ + echo "deb [arch=amd64] https://repo.radeon.com/rocm/apt/${ROCM_VERSION}/ $distro main" | tee /etc/apt/sources.list.d/rocm.list && \ + printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' | tee /etc/apt/preferences.d/rocm-pin-600 && \ + apt-get update && \ + apt-get install -y rocm-dev rocm-libs && \ + apt-get clean -y && rm -rf /var/lib/apt/lists/* + +ENV ROCM_PATH="/opt/rocm-${ROCM_VERSION}.0" + +FROM base-${VARIANT} + +# Set the missing UTF-8 locale, otherwise Elixir warns +ENV LC_ALL=C.UTF-8 + +# Make sure installing packages (like tzdata) doesn't prompt for configuration +ARG DEBIAN_FRONTEND=noninteractive + +# We need to install "add-apt-repository" first +RUN apt-get update && apt-get install -y software-properties-common && \ + # Add repository with the latest git version + add-apt-repository ppa:git-core/ppa && \ + # Install basic system dependencies + apt-get update && apt-get install -y ca-certificates curl git unzip wget && \ + apt-get 
clean -y && rm -rf /var/lib/apt/lists/* + +# Install Bazel using Bazelisk (works for both amd and arm) +RUN wget -O bazel "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-$(dpkg --print-architecture)" && \ + chmod +x bazel && \ + mv bazel /usr/local/bin/bazel + +ENV USE_BAZEL_VERSION=6.5.0 + +# Install Python and the necessary global dependencies +RUN apt-get update && apt-get install -y python3 python3-pip && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + python -m pip install --upgrade pip numpy && \ + apt-get clean -y && rm -rf /var/lib/apt/lists/* + +# Setup project files + +ARG XLA_TARGET + +ENV XLA_TARGET=${XLA_TARGET} +ENV XLA_CACHE_DIR=/build +ENV XLA_BUILD=true + +COPY mix.exs mix.lock ./ +RUN mix deps.get + +COPY lib lib +COPY Makefile ./ +COPY extension extension + +CMD [ "mix", "compile" ] diff --git a/builds/README.md b/builds/README.md index 922fabd..955ccf5 100644 --- a/builds/README.md +++ b/builds/README.md @@ -4,7 +4,7 @@ This directory contains Docker-based automated builds to run off-CI. ## Usage -Run the build script, passing one of the defined variants. +Run the build script, passing one of the defined targets. ```shell ./build.sh cuda12 diff --git a/builds/build.sh b/builds/build.sh index 1bdcfb0..cc71cf2 100755 --- a/builds/build.sh +++ b/builds/build.sh @@ -5,9 +5,9 @@ set -e cd "$(dirname "$0")/.." print_usage_and_exit() { - echo "Usage: $0 <variant>" + echo "Usage: $0 <target>" echo "" - echo "Compiles the project inside docker. Available variants: cpu, cuda12, tpu, rocm." + echo "Compiles the project inside docker. Available targets: cpu, cuda12, tpu, rocm." exit 1 } @@ -15,63 +15,47 @@ if [ $# -ne 1 ]; then print_usage_and_exit fi -# For cuDNN support matrix see [1]. When precompiling, we want to use -# the lowest cuDNN that supports the given CUDA version. 
-# -# [1]: https://docs.nvidia.com/deeplearning/cudnn/archives/index.html +target="$1" -case "$1" in +case "$target" in "cpu") - docker build -t xla-cpu -f builds/cpu.Dockerfile \ + docker build -t xla-cpu -f builds/Dockerfile \ + --build-arg VARIANT=cpu \ --build-arg XLA_TARGET=cpu \ . - - docker run --rm \ - -v $(pwd)/builds/output/cpu/build:/build \ - -v $(pwd)/builds/output/cpu/.cache:/root/.cache \ - $XLA_DOCKER_FLAGS \ - xla-cpu ;; "tpu") - docker build -t xla-tpu -f builds/cpu.Dockerfile \ + docker build -t xla-tpu -f builds/Dockerfile \ + --build-arg VARIANT=cpu \ --build-arg XLA_TARGET=tpu \ . - - docker run --rm \ - -v $(pwd)/builds/output/tpu/build:/build \ - -v $(pwd)/builds/output/tpu/.cache:/root/.cache \ - $XLA_DOCKER_FLAGS \ - xla-tpu ;; "cuda12") - docker build -t xla-cuda12 -f builds/cuda.Dockerfile \ - --build-arg CUDA_VERSION=12.1.0 \ - --build-arg CUDNN_VERSION=8.9.0 \ + docker build -t xla-cuda12 -f builds/Dockerfile \ + --build-arg VARIANT=cuda \ + --build-arg CUDA_VERSION=12-3 \ + --build-arg CUDNN_VERSION=9.1.1.17 \ --build-arg XLA_TARGET=cuda12 \ . - - docker run --rm \ - -v $(pwd)/builds/output/cuda12/build:/build \ - -v $(pwd)/builds/output/cuda12/.cache:/root/.cache \ - $XLA_DOCKER_FLAGS \ - xla-cuda12 ;; "rocm") - docker build -t xla-rocm -f builds/rocm.Dockerfile \ + docker build -t xla-rocm -f builds/Dockerfile \ + --build-arg VARIANT=rocm \ + --build-arg ROCM_VERSION=6.0 \ --build-arg XLA_TARGET=rocm \ . 
- - docker run --rm \ - -v $(pwd)/builds/output/rocm/build:/build \ - -v $(pwd)/builds/output/rocm/.cache:/root/.cache \ - $XLA_DOCKER_FLAGS \ - xla-rocm ;; *) print_usage_and_exit ;; esac + +docker run --rm \ + -v "$(pwd)/builds/output/$target/build:/build" \ + -v "$(pwd)/builds/output/$target/.cache:/root/.cache" \ + $XLA_DOCKER_FLAGS \ + "xla-$target" diff --git a/builds/cpu.Dockerfile b/builds/cpu.Dockerfile deleted file mode 100644 index 923b424..0000000 --- a/builds/cpu.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -FROM hexpm/elixir:1.15.4-erlang-26.0.2-ubuntu-focal-20230126 AS elixir - -# Set the missing UTF-8 locale, otherwise Elixir warns -ENV LC_ALL C.UTF-8 - -# Make sure installing packages (like tzdata) doesn't prompt for configuration -ENV DEBIAN_FRONTEND noninteractive - -# We need to install "add-apt-repository" first -RUN apt-get update && apt-get install -y software-properties-common && \ - # Add repository with the latest git version - add-apt-repository ppa:git-core/ppa && \ - # Install basic system dependencies - apt-get update && apt-get install -y ca-certificates curl git unzip wget - -# Install Bazel using Bazelisk (works for both amd and arm) -RUN wget -O bazel "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-$(dpkg --print-architecture)" && \ - chmod +x bazel && \ - mv bazel /usr/local/bin/bazel - -ENV USE_BAZEL_VERSION 6.1.0 - -# Install Python and the necessary global dependencies -RUN apt-get install -y python3 python3-pip && \ - ln -s /usr/bin/python3 /usr/bin/python && \ - python -m pip install --upgrade pip numpy - -# --- - -ARG XLA_TARGET - -ENV XLA_TARGET=${XLA_TARGET} -ENV XLA_CACHE_DIR=/build -ENV XLA_BUILD=true - -COPY mix.exs mix.lock ./ -RUN mix deps.get - -COPY lib lib -COPY Makefile ./ -COPY extension extension - -CMD [ "mix", "compile" ] diff --git a/builds/cuda.Dockerfile b/builds/cuda.Dockerfile deleted file mode 100644 index 1e2794c..0000000 --- a/builds/cuda.Dockerfile +++ /dev/null @@ -1,66 +0,0 
@@ -ARG CUDA_VERSION - -FROM hexpm/elixir:1.15.4-erlang-26.0.2-ubuntu-focal-20230126 AS elixir - -FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 - -ARG CUDNN_VERSION - -# Set the missing UTF-8 locale, otherwise Elixir warns -ENV LC_ALL C.UTF-8 - -# Make sure installing packages (like tzdata) doesn't prompt for configuration -ENV DEBIAN_FRONTEND noninteractive - -# We need to install "add-apt-repository" first -RUN apt-get update && apt-get install -y software-properties-common && \ - # Add repository with the latest git version - add-apt-repository ppa:git-core/ppa && \ - # Install basic system dependencies - apt-get update && apt-get install -y ca-certificates curl git unzip wget - -# Install a specific cuDNN version over the default one -RUN cuda_version="${CUDA_VERSION}" && \ - cudnn_version="${CUDNN_VERSION}" && \ - cudnn_package_version="$(apt-cache madison libcudnn8 | grep -o "${cudnn_version}.*-1+cuda${cuda_version%.*}")" && \ - apt-get install -y --allow-downgrades --allow-change-held-packages libcudnn8=$cudnn_package_version libcudnn8-dev=$cudnn_package_version - -# Install Bazel using Bazelisk (works for both amd and arm) -RUN wget -O bazel "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-$(dpkg --print-architecture)" && \ - chmod +x bazel && \ - mv bazel /usr/local/bin/bazel - -ENV USE_BAZEL_VERSION 6.5.0 - -# Install Python and the necessary global dependencies -RUN apt-get install -y python3 python3-pip && \ - ln -s /usr/bin/python3 /usr/bin/python && \ - python -m pip install --upgrade pip numpy - -# Install Erlang and Elixir - -# Erlang runtime dependencies, see https://github.com/hexpm/bob/blob/3b5721dccdfe9d59766f374e7b4fb7fb8a7c720e/priv/scripts/docker/erlang-ubuntu-focal.dockerfile#L41-L45 -RUN apt-get install -y --no-install-recommends libodbc1 libssl1.1 libsctp1 - -# We copy the top-level directory first to preserve symlinks in /usr/local/bin -COPY --from=elixir /usr/local /usr/ELIXIR_LOCAL -RUN cp -r 
/usr/ELIXIR_LOCAL/lib/* /usr/local/lib && \ - cp -r /usr/ELIXIR_LOCAL/bin/* /usr/local/bin && \ - rm -rf /usr/ELIXIR_LOCAL - -# --- - -ARG XLA_TARGET - -ENV XLA_TARGET=${XLA_TARGET} -ENV XLA_CACHE_DIR=/build -ENV XLA_BUILD=true - -COPY mix.exs mix.lock ./ -RUN mix deps.get - -COPY lib lib -COPY Makefile ./ -COPY extension extension - -CMD [ "mix", "compile" ] diff --git a/builds/rocm.Dockerfile b/builds/rocm.Dockerfile deleted file mode 100644 index 676350c..0000000 --- a/builds/rocm.Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -FROM hexpm/elixir:1.15.4-erlang-26.0.2-ubuntu-focal-20230126 AS elixir - -FROM rocm/dev-ubuntu-20.04:6.0-complete - -# Set the missing UTF-8 locale, otherwise Elixir warns -ENV LC_ALL C.UTF-8 - -# Make sure installing packages (like tzdata) doesn't prompt for configuration -ENV DEBIAN_FRONTEND noninteractive - -# We need to install "add-apt-repository" first -RUN apt-get update && apt-get install -y software-properties-common && \ - # Add repository with the latest git version - add-apt-repository ppa:git-core/ppa && \ - # Install basic system dependencies - apt-get update && apt-get install -y ca-certificates curl git unzip wget - -# Install Bazel using Bazelisk (works for both amd and arm) -RUN wget -O bazel "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-$(dpkg --print-architecture)" && \ - chmod +x bazel && \ - mv bazel /usr/local/bin/bazel - -ENV USE_BAZEL_VERSION 6.5.0 - -# Install Python and the necessary global dependencies -RUN apt-get install -y python3 python3-pip && \ - ln -s /usr/bin/python3 /usr/bin/python && \ - python -m pip install --upgrade pip numpy - -# Install Erlang and Elixir - -# Erlang runtime dependencies, see https://github.com/hexpm/bob/blob/3b5721dccdfe9d59766f374e7b4fb7fb8a7c720e/priv/scripts/docker/erlang-ubuntu-focal.dockerfile#L41-L45 -RUN apt-get install -y --no-install-recommends libodbc1 libssl1.1 libsctp1 - -# We copy the top-level directory first to preserve symlinks in 
/usr/local/bin -COPY --from=elixir /usr/local /usr/ELIXIR_LOCAL -RUN cp -r /usr/ELIXIR_LOCAL/lib/* /usr/local/lib && \ - cp -r /usr/ELIXIR_LOCAL/bin/* /usr/local/bin && \ - rm -rf /usr/ELIXIR_LOCAL - -# --- - -ENV ROCM_PATH "/opt/rocm-6.0.0" - -# --- - -ARG XLA_TARGET - -ENV XLA_TARGET=${XLA_TARGET} -ENV XLA_CACHE_DIR=/build -ENV XLA_BUILD=true - -COPY mix.exs mix.lock ./ -RUN mix deps.get - -COPY lib lib -COPY Makefile ./ -COPY extension extension - -CMD [ "mix", "compile" ]