Skip to content

Commit

Permalink
Merge pull request #471 from sony/feature/20230614-combine-hpcx-openmpi
Browse files Browse the repository at this point in the history
Support HPC-X for release docker images
  • Loading branch information
YukioOobuchi authored Jun 28, 2023
2 parents 8751745 + e9ed31c commit 506c928
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 30 deletions.
12 changes: 12 additions & 0 deletions docker/release/.entrypoint-cuda-mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@

source /etc/shinit_v2

# Put an MPI installation on the search paths.
# If the HPC-X bundle directory exists, its init script is sourced from inside
# that directory and hpcx_load (expected to be defined by the sourced script)
# is called; otherwise the plain /opt/mpi prefix is added to PATH and
# LD_LIBRARY_PATH.
if [ -d "/opt/mpi/hpcx-v2.12" ]; then
    # Remember the caller's directory so it can be restored afterwards.
    curdir=$PWD
    cd /opt/mpi/hpcx-v2.12
    . ./hpcx-init.sh
    hpcx_load
    # Quoted so a working directory containing whitespace is restored intact.
    cd "$curdir"
    unset curdir
else
    # Quoted: some /bin/sh implementations word-split unquoted `export VAR=$X`.
    export PATH="/opt/mpi/bin:$PATH"
    # Append the previous value only when it is non-empty; a trailing ':' would
    # add an empty entry, which the dynamic loader treats as the current directory.
    export LD_LIBRARY_PATH="/opt/mpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
fi

if [ $# -eq 0 ]; then
exec "/bin/bash"
else
Expand Down
12 changes: 12 additions & 0 deletions docker/release/20-nvidia-cuda-compat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,15 @@
# limitations under the License.

test -f /etc/shinit_v2 && . /etc/shinit_v2

# Put an MPI installation on the search paths.
# If the HPC-X bundle directory exists, its init script is sourced from inside
# that directory and hpcx_load (expected to be defined by the sourced script)
# is called; otherwise the plain /opt/mpi prefix is added to PATH and
# LD_LIBRARY_PATH.
if [ -d "/opt/mpi/hpcx-v2.12" ]; then
    # Remember the caller's directory so it can be restored afterwards.
    curdir=$PWD
    cd /opt/mpi/hpcx-v2.12
    . ./hpcx-init.sh
    hpcx_load
    # Quoted so a working directory containing whitespace is restored intact.
    cd "$curdir"
    unset curdir
else
    # Quoted: some /bin/sh implementations word-split unquoted `export VAR=$X`.
    export PATH="/opt/mpi/bin:$PATH"
    # Append the previous value only when it is non-empty; a trailing ':' would
    # add an empty entry, which the dynamic loader treats as the current directory.
    export LD_LIBRARY_PATH="/opt/mpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
fi
84 changes: 54 additions & 30 deletions docker/release/Dockerfile.cuda-mpi
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,57 @@ ARG APT_OPTS
ARG MPI
ARG MPI_OPTS

ENV DEBIAN_FRONTEND noninteractive

RUN eval ${APT_OPTS} && apt-get update
RUN apt-get install -y --no-install-recommends \
RUN eval ${APT_OPTS} \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
curl \
gfortran \
libibverbs-dev

RUN apt-get install -y --no-install-recommends ca-certificates

RUN mkdir /tmp/openmpi
RUN curl ${CURL_OPTS} https://download.open-mpi.org/release/open-mpi/v${MPI%.*}/openmpi-${MPI}.tar.bz2 -o /tmp/openmpi/openmpi-${MPI}.tar.bz2
RUN tar Cxvf /tmp/openmpi /tmp/openmpi/openmpi-${MPI}.tar.bz2
RUN cd tmp/openmpi/openmpi-${MPI} \
&& ./configure \
--prefix=/opt/openmpi --enable-orterun-prefix-by-default --with-sge ${MPI_OPTS} \
CC=gcc \
CXX=g++ \
F77=gfortran \
FC=gfortran \
&& make -j8 \
&& make install \
&& echo btl_openib_allow_ib = 1 >> /opt/openmpi/etc/openmpi-mca-params.conf
libibverbs-dev \
librdmacm1 \
librdmacm-dev \
libnuma1 \
libnuma-dev \
patch \
&& rm -rf /var/lib/apt/lists/*

COPY hpcx-init.patch /tmp/hpcx-init.patch
COPY hpcx-ompi-etc.patch /tmp/hpcx-ompi-etc.patch
# Install MPI under /opt/mpi.
# The awk expression packs the dotted ${MPI} version into one integer
# (major, then minor and patch as two digits each, e.g. 4.1.4 -> 40104).
#   * Greater than 30106: download the prebuilt HPC-X v2.12 bundle, rebuild
#     its Open MPI with --enable-openib-rdmacm-ibaddr, copy the rebuilt
#     verbs/openib files into the bundled ./ompi tree, apply the two patches
#     copied to /tmp, and make ./ompi/etc/* group/other-writable.
#   * Otherwise: build Open MPI ${MPI} from source with prefix /opt/mpi and
#     append btl_openib_allow_ib = 1 to its MCA parameter file.
# The rebuild output lives in ./hpcx-rebuild (see the cp sources), so that
# directory is removed together with the source checkout once the needed
# files have been copied; the whole of /opt/mpi is later copied into the
# release stage.
# NOTE(review): the HPC-X archive is fetched over plain HTTP without checksum
# verification - consider an HTTPS URL plus a sha256 check once confirmed.
# NOTE(review): chmod og+w on ./ompi/etc/* makes the MCA config writable by
# every user; presumably intentional - confirm.
RUN if [ "$(echo "${MPI}" | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }')" -gt 30106 ]; then \
        mkdir /opt/mpi \
        && cd /opt/mpi \
        && curl ${CURL_OPTS} -fLO http://www.mellanox.com/downloads/hpc/hpc-x/v2.12/hpcx-v2.12-gcc-MLNX_OFED_LINUX-5-ubuntu18.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz \
        && tar -xvf hpcx*.tbz \
        && rm -f hpcx*.tbz \
        && mv hpcx* hpcx-v2.12 \
        && cd hpcx-v2.12 \
        && ./utils/hpcx_rebuild.sh --ompi-extra-config --enable-openib-rdmacm-ibaddr \
        && cp -ap ./hpcx-rebuild/lib/libmca_common_verbs.* ./ompi/lib/ \
        && cp -ap ./hpcx-rebuild/lib/openmpi/mca_btl_openib.* ./ompi/lib/openmpi/ \
        && cp -ap ./hpcx-rebuild/share/openmpi/*verbs* ./ompi/share/openmpi/ \
        && cp -ap ./hpcx-rebuild/share/openmpi/*openib* ./ompi/share/openmpi/ \
        && rm -rf ./sources/openmpi-gitclone ./hpcx-rebuild ./hpcx_rebuild ./hpcx_rebuild.sh \
        && patch -p1 < /tmp/hpcx-init.patch \
        && patch -p1 < /tmp/hpcx-ompi-etc.patch \
        && chmod og+w ./ompi/etc/*; \
    else \
        mkdir /tmp/openmpi \
        && curl ${CURL_OPTS} https://download.open-mpi.org/release/open-mpi/v${MPI%.*}/openmpi-${MPI}.tar.bz2 -o /tmp/openmpi/openmpi-${MPI}.tar.bz2 \
        && tar Cxvf /tmp/openmpi /tmp/openmpi/openmpi-${MPI}.tar.bz2 \
        && cd /tmp/openmpi/openmpi-${MPI} \
        && ./configure \
            --prefix=/opt/mpi --enable-orterun-prefix-by-default --with-sge ${MPI_OPTS} \
            CC=gcc \
            CXX=g++ \
            F77=gfortran \
            FC=gfortran \
        && make -j8 \
        && make install \
        && echo btl_openib_allow_ib = 1 >> /opt/mpi/etc/openmpi-mca-params.conf; \
    fi


FROM nvidia/cuda:${CU1_VER}.${CU2_VER}-cudnn${CUDNN_VER}-runtime-ubuntu18.04

Expand All @@ -63,17 +90,15 @@ ARG CURL_OPTS
ARG WGET_OPTS
ARG APT_OPTS

ENV DEBIAN_FRONTEND noninteractive

RUN eval ${APT_OPTS} \
&& apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
&& apt-get update \
&& apt-get install -y software-properties-common \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
bzip2 \
ca-certificates \
curl \
Expand All @@ -83,14 +108,15 @@ RUN eval ${APT_OPTS} \
libdapl2 \
libibmad5 \
librdmacm1 \
libnuma1 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

################################################## build python from pyenv
ARG NNABLA_VER
RUN eval ${APT_OPTS} \
&& apt update \
&& apt install -y --no-install-recommends \
&& apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
git \
make \
build-essential \
Expand Down Expand Up @@ -147,9 +173,7 @@ RUN eval ${APT_OPTS} \
|| echo "Skip DALI installation (CUDA=${CU1_VER}.${CU2_VER})") \
&& pip install ${PIP_INS_OPTS} --no-cache-dir nnabla-ext-cuda${CU1_VER}${CU2_VER%.?}==${NNABLA_VER} nnabla_converter==${NNABLA_VER}

COPY --from=openmpi /opt/openmpi /opt/openmpi
ENV PATH /opt/openmpi/bin:$PATH
ENV LD_LIBRARY_PATH /opt/openmpi/lib:$LD_LIBRARY_PATH
COPY --from=openmpi /opt/mpi /opt/mpi

# cuda compat driver support
COPY cudalibcheck /usr/local/bin/cudalibcheck
Expand Down
89 changes: 89 additions & 0 deletions docker/release/hpcx-init.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
--- old/hpcx-init-ompi.sh 2023-06-14 19:39:44.791319808 +0900
+++ new/hpcx-init-ompi.sh 2023-06-14 19:42:08.339318226 +0900
@@ -1,5 +1,5 @@
#!/bin/bash
-mydir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+mydir="/opt/mpi/hpcx-v2.12"

export HPCX_DIR=$mydir
export HPCX_UCX_DIR=${HPCX_DIR}/ucx
@@ -20,13 +20,18 @@
export OSHMEM_HOME=${HPCX_MPI_DIR}
export SHMEM_HOME=${HPCX_MPI_DIR}

-function hpcx_load()
-{
- PATH=${PATH:-""}
- OPAL_PREFIX=${OPAL_PREFIX:-""}
+hpcx_load() {
+ if [ "x$PATH" = "x" ]; then
+ export PATH=""
+ fi
+ if [ "x$OPAL_PREFIX" = "x" ]; then
+ export OPAL_PREFIX=""
+ fi
export OLD_PATH=$PATH
export OLD_OPAL_PREFIX=${OPAL_PREFIX}
+ export OLD_PMIX_INSTALL_PREFIX=${PMIX_INSTALL_PREFIX}
export OPAL_PREFIX=${HPCX_MPI_DIR}
+ export PMIX_INSTALL_PREFIX=${HPCX_MPI_DIR}
export PATH=${HPCX_MPI_DIR}/bin:$PATH
export PATH=${HPCX_UCX_DIR}/bin:$PATH
export PATH=${HPCX_UCC_DIR}/bin:$PATH
@@ -34,7 +39,9 @@
export PATH=${HPCX_CLUSTERKIT_DIR}/bin:$PATH
export PATH=${HPCX_SHARP_DIR}/bin:$PATH

- LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-""}
+ if [ "x$LD_LIBRARY_PATH" = "x" ]; then
+ export LD_LIBRARY_PATH=""
+ fi
export OLD_LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${HPCX_MPI_DIR}/lib:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${HPCX_HCOLL_DIR}/lib:${LD_LIBRARY_PATH}
@@ -45,7 +52,9 @@
export LD_LIBRARY_PATH=${HPCX_UCC_DIR}/lib/ucc:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=${HPCX_NCCL_RDMA_SHARP_PLUGIN_DIR}/lib:${LD_LIBRARY_PATH}

- LIBRARY_PATH=${LIBRARY_PATH:-""}
+ if [ "x$LIBRARY_PATH" = "x" ]; then
+ export LIBRARY_PATH=""
+ fi
export OLD_LIBRARY_PATH=${LIBRARY_PATH}
export LIBRARY_PATH=${HPCX_MPI_DIR}/lib:${LIBRARY_PATH}
export LIBRARY_PATH=${HPCX_HCOLL_DIR}/lib:${LIBRARY_PATH}
@@ -55,7 +64,9 @@
export LIBRARY_PATH=${HPCX_MPI_DIR}/lib:${LIBRARY_PATH}
export LIBRARY_PATH=${HPCX_NCCL_RDMA_SHARP_PLUGIN_DIR}/lib:${LIBRARY_PATH}

- CPATH=${CPATH:-""}
+ if [ "x$CPATH" = "x" ]; then
+ export CPATH=""
+ fi
export OLD_CPATH=$CPATH
export CPATH=${HPCX_HCOLL_DIR}/include:$CPATH
export CPATH=${HPCX_SHARP_DIR}/include:$CPATH
@@ -63,18 +74,20 @@
export CPATH=${HPCX_UCC_DIR}/include:$CPATH
export CPATH=${HPCX_MPI_DIR}/include:$CPATH

- PKG_CONFIG_PATH=${PKG_CONFIG_PATH:-""}
+ if [ "x$PKG_CONFIG_PATH" = "x" ]; then
+ export PKG_CONFIG_PATH=""
+ fi
export OLD_PKG_CONFIG_PATH=${PKG_CONFIG_PATH}
export PKG_CONFIG_PATH=${HPCX_MPI_DIR}/lib/pkgconfig:${PKG_CONFIG_PATH}
export PKG_CONFIG_PATH=${HPCX_HCOLL_DIR}/lib/pkgconfig:${HPCX_SHARP_DIR}/lib/pkgconfig:${HPCX_UCX_DIR}/lib/pkgconfig:${PKG_CONFIG_PATH}
}

-function hpcx_unload()
-{
+hpcx_unload() {
export PATH=${OLD_PATH}
export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}
export LIBRARY_PATH=${OLD_LIBRARY_PATH}
export OPAL_PREFIX=${OLD_OPAL_PREFIX}
+ export PMIX_INSTALL_PREFIX=${OLD_PMIX_INSTALL_PREFIX}
export CPATH=${OLD_CPATH}
export PKG_CONFIG_PATH=${OLD_PKG_CONFIG_PATH}
for var in $(env|grep HPCX_|cut -f1 -d=) MPI_HOME OSHMEM_HOME SHMEM_HOME OMPI_HOME; do
19 changes: 19 additions & 0 deletions docker/release/hpcx-ompi-etc.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
--- old/ompi/etc/openmpi-mca-params.conf 2023-06-26 20:44:30.747239585 +0900
+++ new/ompi/etc/openmpi-mca-params.conf 2023-06-26 20:49:55.143401095 +0900
@@ -59,8 +59,7 @@
# parameters available and their default values.
#rmaps_base_mapping_policy = dist:auto
coll = ^ml
-hwloc_base_binding_policy = core
-btl = self
+hwloc_base_binding_policy = none # core
pml_ucx_tls = any
pml_ucx_devices = any
opal_common_ucx_opal_mem_hooks = 0
@@ -92,4 +91,6 @@
coll_tuned_scatter_large_msg = 250000
coll_tuned_scatter_min_procs = 1048510
coll_tuned_scatter_algorithm_max_requests = 64
+coll_hcoll_enable = 0

+btl_openib_allow_ib = 1

0 comments on commit 506c928

Please sign in to comment.