-
Notifications
You must be signed in to change notification settings - Fork 56
/
Copy pathDockerfile.base
237 lines (196 loc) · 8.77 KB
/
Dockerfile.base
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# syntax=docker/dockerfile:1-labs
# Global build arguments. They are declared before the first FROM, so a stage
# that needs one of them has to re-declare it with a bare `ARG <name>`.
# Base image of the final stage; its value is also stored in CUDA_BASE_IMAGE.
ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
# Identity written to the global git configuration inside the image.
ARG GIT_USER_NAME="JAX Toolbox"
ARG GIT_USER_EMAIL=jax@nvidia.com
# LLVM/Clang major version handed to the apt.llvm.org installer script.
ARG CLANG_VERSION=18
# Passed as the argument of install-nsys-jax.sh. There is no default, so it is
# empty unless supplied with --build-arg.
ARG JAX_TOOLBOX_REF
###############################################################################
## Obtain GCP's NCCL TCPx plugin
###############################################################################
# Three stages cooperate here:
#   * tcpx-installer-amd64: the plugin image published by GCP;
#   * tcpx-installer-arm64: a stub that offers the same entry script and output
#     directory, because GCP does not provide an arm64 version of the plugin;
#   * tcpx-installer: whichever of the two matches the target architecture,
#     with the `install` step executed.
FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64
# The stub only needs a shell, so any Ubuntu release works; the tag is pinned
# so that the build does not silently follow `ubuntu:latest`.
FROM ubuntu:24.04 AS tcpx-installer-arm64
RUN <<"OUTEREOF" bash -ex
mkdir -p /scripts /var/lib/tcpx/lib64
echo '#!/bin/bash' > /scripts/container_entry.sh
chmod +x /scripts/container_entry.sh
OUTEREOF
# TARGETARCH is one of BuildKit's automatic platform arguments.
FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
RUN /scripts/container_entry.sh install
###############################################################################
## Build base image
###############################################################################
FROM ${BASE_IMAGE}
# Bring the global build arguments into the scope of this stage.
ARG BASE_IMAGE
# Keep the name of the image this stage was built from available at runtime.
ENV CUDA_BASE_IMAGE=${BASE_IMAGE}
ARG GIT_USER_NAME
ARG GIT_USER_EMAIL
ARG CLANG_VERSION
ARG JAX_TOOLBOX_REF
###############################################################################
## Install Python and essential tools
###############################################################################
# A single layer that:
#   1. installs the apt packages listed below (plus HDF5 build dependencies on
#      arm64);
#   2. installs LLVM/Clang ${CLANG_VERSION} with the apt.llvm.org script and
#      registers it as the default clang/lld through update-alternatives;
#   3. pins the unversioned `clang` and `lld` packages so that they cannot be
#      installed later;
#   4. cleans the apt caches and removes apt-installed Python packages that
#      would otherwise break the pip-based JAX installation.
RUN <<"EOF" bash -ex
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles
apt-get update
apt_packages=(
  # generic
  bat
  build-essential
  checkinstall
  cmake
  curl
  git
  gnupg
  liblzma-dev
  python-is-python3
  python3-pip
  rsync
  vim
  wget
  jq
  # llvm.sh
  lsb-release
  software-properties-common
  # GCP autoconfig
  pciutils hwloc bind9-host
)
if [[ $(dpkg --print-architecture) == arm64 ]]; then
  # h5py: The newest release of h5py (3.11.0) does not include ARM wheels and causes pip to build h5py.
  # These installs ensure that 3.11.0, or any future version missing ARM, can be built.
  # Related: https://github.com/h5py/h5py/issues/2408
  apt_packages+=(pkg-config libhdf5-dev)
fi
apt-get install -y "${apt_packages[@]}"
# Install LLVM/Clang. The installer is downloaded to a file first: with
# `bash -c "$(wget ...)"` a failed download would expand to an empty script that
# exits successfully, hiding the error from `bash -e`.
wget -O /tmp/llvm.sh https://apt.llvm.org/llvm.sh
bash /tmp/llvm.sh ${CLANG_VERSION}
rm -f /tmp/llvm.sh
# Make sure that clang and clang++ point to the new version. This list is based
# on the symlinks installed by the `clang` (as opposed to `clang-14`) and `lld`
# (as opposed to `lld-14`) packages available in Ubuntu 22.04. 100 is an
# arbitrary priority.
update-alternatives --verbose \
  --install /usr/bin/clang clang /usr/lib/llvm-${CLANG_VERSION}/bin/clang 100 \
  --slave /usr/bin/asan_symbolize asan_symbolize /usr/bin/asan_symbolize-${CLANG_VERSION} \
  --slave /usr/bin/clang++ clang++ /usr/lib/llvm-${CLANG_VERSION}/bin/clang++ \
  --slave /usr/bin/ld.lld ld.lld /usr/lib/llvm-${CLANG_VERSION}/bin/lld \
  --slave /usr/bin/lld lld /usr/lib/llvm-${CLANG_VERSION}/bin/lld \
  --slave /usr/bin/lld-link lld-link /usr/lib/llvm-${CLANG_VERSION}/bin/lld \
  --slave /usr/bin/wasm-ld wasm-ld /usr/lib/llvm-${CLANG_VERSION}/bin/lld
# Make sure that any later attempt to install `clang` or `lld` will fail.
# The two records below must stay separated by a blank line; apt treats each
# blank-line-delimited stanza as one preference record.
cat >/etc/apt/preferences.d/no-unversioned-clang-lld <<EOL
# LLVM is installed from apt.llvm.org using versioned packages, whereas the
# unversioned clang and lld packages come from Ubuntu and refer to older
# versions of LLVM. Please use versioned packages in this container.
Package: clang
Pin: release *
Pin-Priority: -1

Package: lld
Pin: release *
Pin-Priority: -1
EOL
apt-get clean
rm -rf /var/lib/apt/lists/*
# Several Python packages (in the list below) are installed by the OS package
# manager (the `apt-get install` run above) and cannot be uninstalled with pip
# (in the pip-finalize.sh script) during JAX installation. Remove them in
# advance to avoid JAX installation issues.
remove_packages=(
  python3-gi
  software-properties-common
  lsb-release
  python3-yaml
  python3-pygments
)
apt-get remove -y "${remove_packages[@]}"
apt-get autoremove -y # removes python3-blinker which conflicts with pip-compile in JAX
EOF
# Record the build-arg supplied identity in the global git configuration; the
# values are expanded by the shell from the ARG environment.
RUN git config --global user.name "${GIT_USER_NAME}" \
 && git config --global user.email "${GIT_USER_EMAIL}"
# NOTE(review): these directories are presumably consumed by pip-finalize.sh —
# confirm against that script.
RUN mkdir -p /opt/pip-tools.d /opt/pip-tools-post-install.d
# Plain local files: COPY is sufficient. 755 keeps the helpers executable by
# every user without leaving executables on PATH world-writable.
COPY --chmod=755 \
    git-clone.sh \
    pip-finalize.sh \
    /usr/local/bin/
# Fetch the yq binary that matches the image architecture.
# NOTE(review): `releases/latest` is not pinned, so rebuilds may pick up a
# different yq version.
RUN wget "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)" -O /usr/local/bin/yq && \
    chmod 755 /usr/local/bin/yq
# Check out pip 23.3.1 and commit the VCS-equivalency patch on top of it, so
# that /opt/pip is a clean git work tree containing the patched sources.
RUN git clone -b 23.3.1 https://github.com/pypa/pip.git /opt/pip
# Patch is specific to 23.3.1
# Generated via: "git diff > pip-vcs-equivalency.patch"
COPY pip-vcs-equivalency.patch /opt/pip/
# The delimiter is quoted, like the other heredocs in this file, so that the
# script reaches bash verbatim. `git commit` uses the identity configured in
# the global git configuration earlier in this stage.
RUN <<"EOF" bash -e -x
cd /opt/pip
git apply </opt/pip/pip-vcs-equivalency.patch
git add -u
git commit -m 'Adds JAX_TOOLBOX_VCS_EQUIVALENCY as a trigger to treat all github VCS installs for a package as equivalent. The spec of the last encountered version will be used'
EOF
# Python packages are installed system-wide, so pip must be allowed to modify
# the distribution-managed environment.
ENV PIP_BREAK_SYSTEM_PACKAGES=1
# Why `--ignore-installed`: when upgrading to 23.3.1 (from /opt/pip), pip tries
# to uninstall itself (the default pip-24.0) and fails, because pip-24.0 was
# installed by `apt` rather than by `python`. Both pip-24.0 and pip-23.3.1
# therefore stay in the system, and 23.3.1 with the equivalency patch is the
# one that gets used.
RUN pip install \
        --upgrade \
        --ignore-installed \
        --no-cache-dir \
        -e /opt/pip \
        pip-tools \
    && rm -rf ~/.cache/*
###############################################################################
## Install TCPx
###############################################################################
# Location of the TCPx plugin libraries inside the final image.
ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
# Copy the libraries out of the tcpx-installer stage. When that stage is the
# arm64 stub, the source directory is created by `mkdir -p` and nothing is
# written into it.
COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}
###############################################################################
## Install the latest versions of Nsight Systems and Nsight Compute
###############################################################################
# The installer is a plain local file, so COPY is used instead of ADD. The
# trailing slash makes it explicit that the destination is a directory.
COPY install-nsight.sh /usr/local/bin/
RUN install-nsight.sh
###############################################################################
## Install cuDNN
###############################################################################
# The installer is a plain local file, so COPY is used instead of ADD. The
# trailing slash makes it explicit that the destination is a directory.
COPY install-cudnn.sh /usr/local/bin/
RUN install-cudnn.sh
###############################################################################
## Install NCCL
###############################################################################
# The installer is a plain local file, so COPY is used instead of ADD. The
# trailing slash makes it explicit that the destination is a directory.
COPY install-nccl.sh /usr/local/bin/
RUN install-nccl.sh
###############################################################################
## RoCE and InfiniBand support
###############################################################################
# The installer is a plain local file, so COPY is used instead of ADD. The
# trailing slash makes it explicit that the destination is a directory.
COPY install-ofed.sh /usr/local/bin/
RUN install-ofed.sh
##############################################################################
## Amazon EFA support (need to run it inside container separately)
##############################################################################
# The scripts are only copied here; install-efa.sh is not executed during the
# build. 755 keeps them executable by every user without making them
# world-writable.
COPY --chmod=755 \
    install-efa.sh \
    test-aws-efa.sh \
    /usr/local/bin/
# Prepend the EFA library directory. The `:+` form adds the separator only when
# LD_LIBRARY_PATH already has a value; a bare trailing `:` would put the
# current working directory on the library search path.
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
ENV PATH=/opt/amazon/efa/bin:${PATH}
##############################################################################
## NCCL sanity check utility
##############################################################################
# All sources are plain local files, so COPY is used instead of ADD. The CUDA
# source is placed in /opt before the install script runs.
COPY install-nccl-sanity-check.sh /usr/local/bin/
COPY nccl-sanity-check.cu /opt/
RUN install-nccl-sanity-check.sh
# Helper commands placed on PATH.
COPY jax-nccl-test parallel-launch /usr/local/bin/
###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################
# NOTE(review): presumably the base image's entrypoint runs the scripts found
# in /opt/nvidia/entrypoint.d/ — confirm against the base image.
COPY check-shm.sh /opt/nvidia/entrypoint.d/
###############################################################################
## Add the GCP - TCPX check to the entrypoint.
###############################################################################
# Currently disabled: the COPY below is commented out.
# TODO(chaserileyroberts): Re-enable once fully tested on GCP.
# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/
###############################################################################
## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems
###############################################################################
# The installer is a plain local file, so COPY is used instead of ADD.
COPY install-nsys-jax.sh /usr/local/bin/
# JAX_TOOLBOX_REF is deliberately left unquoted: when the build argument is
# empty, the script is invoked without any argument.
RUN install-nsys-jax.sh ${JAX_TOOLBOX_REF}
###############################################################################
## Copy manifest file to the container
###############################################################################
# Path of the manifest inside the image, exported for use at runtime.
ENV MANIFEST_FILE="/opt/manifest.d/manifest.yaml"
# Plain local files, so COPY is used instead of ADD, matching the patches/
# copy below.
COPY manifest.yaml create-distribution.sh bump.sh /opt/manifest.d/
COPY patches/ /opt/manifest.d/patches/