-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparameterized_nccl_build.sh
191 lines (144 loc) · 6.08 KB
/
parameterized_nccl_build.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Script to build nccl, nccl-test and aws-ofi for specific version of nccl
#
# Paths specialized for DLAMI 23 which has cuda-10 and openfabric enabled mpi
# Applies ~/aws-ofi-nccl.patch if present.
#
#
# Parameters:
# NCCL_VERSION_TAG: for folder name, all results go to
# FOLDER_ROOT=~/nccl/nccl-$NCCL_VERSION_TAG
# GIT_CHECKOUT_CMD: command run inside the nccl checkout after git clone, right before building
# NCCL_REUSE_PREVIOUS_BUILD: if unset, $FOLDER_ROOT is wiped (sudo rm -Rf) before building
#
# Result:
# $FOLDER_ROOT/nccl/build/lib/libnccl.so
# $FOLDER_ROOT/aws-ofi-nccl/install/{bin, lib}
# $FOLDER_ROOT/nccl-tests/build/all_gather_perf
#
# Examples:
# NCCL_VERSION_TAG=2.3.7
# GIT_CHECKOUT_CMD="git checkout v2.3.7-1"
# source parameterized_nccl_build.sh
#
# NCCL_VERSION_TAG=2.4.7
# GIT_CHECKOUT_CMD="git checkout v2.4.7-1"
# source parameterized_nccl_build.sh
#
# NCCL_VERSION_TAG=2.4.7ms0
# GIT_CHECKOUT_CMD="git checkout dev/kwen/multi-socket"
# source parameterized_nccl_build.sh
#
# Alternative instructions (https://gist.github.com/indhub/212525192df033db95d269efba31c15a)
# ------------------------------------------------------------------------------
# Parameter validation
#
# Done first -- before `set -e`, `pushd`, and any system modification -- so a
# missing parameter aborts with nothing changed. The script is meant to be
# sourced, so `return` is tried first (a bare `exit` would close the caller's
# shell and report status 0); `exit 1` is the fallback when executed directly.
# ------------------------------------------------------------------------------
logtag="parameterized_nccl_build.sh: "
if [ -z "${NCCL_VERSION_TAG+x}" ]; then
  echo "$logtag Error: Must set NCCL_VERSION_TAG" >&2
  return 1 2>/dev/null || exit 1
fi
if [ -z "${GIT_CHECKOUT_CMD+x}" ]; then
  echo "$logtag Error: Must set GIT_CHECKOUT_CMD" >&2
  return 1 2>/dev/null || exit 1
fi

set -e
pushd .
# remove existing MPI installations
sudo mv /usr/local/mpi /usr/local/mpi.hidden || echo "ignoring"
# mv $HOME/anaconda3/lib $HOME/anaconda3/lib.hidden # this has MPI, but needed to run python
export FOLDER_ROOT="$HOME/nccl/nccl-$NCCL_VERSION_TAG"
# PyTorch install asks for NCCL_ROOT_DIR, contains build, README.md, src
# export NCCL_ROOT_DIR=$FOLDER_ROOT/nccl
# CUDA_HOME is used for nccl:nccl-tests:aws-ofi, contains bin, lib, include
export CUDA_HOME=/usr/local/cuda-10.0
export EFA_HOME=/opt/amazon/efa
# MPI_HOME is used for aws-ofi:nccl-test, contains bin, lib, include
#export MPI_HOME=$HOME/anaconda3
export MPI_HOME=$EFA_HOME
# NCCL_HOME is used for aws-ofi:nccl-tests, contains include, lib
export NCCL_HOME=$FOLDER_ROOT/nccl/build
# export LD_LIBRARY_PATH=$FOLDER_ROOT/aws-ofi-nccl/install/lib/:$NCCL_HOME/lib:$CUDA_HOME/lib64:$EFA_HOME/lib64:/lib64:$LD_LIBRARY_PATH

# Wipe the previous build unless the caller asked to reuse it. sudo is needed
# because a later `sudo make install` writes root-owned files under
# $FOLDER_ROOT. `:?` guards against ever running rm -Rf on an empty path.
if [ -z "${NCCL_REUSE_PREVIOUS_BUILD+x}" ]; then
  echo "NCCL_REUSE_PREVIOUS_BUILD is not set, building from scratch"
  sudo rm -Rf -- "${FOLDER_ROOT:?}"
else
  echo "NCCL_REUSE_PREVIOUS_BUILD is set, reusing previous build"
fi

################################################################################
# EFA
################################################################################
# One command per line: inside an `a && b && c` list `set -e` only reacts to
# the LAST command, so a failed download would otherwise be silently skipped.
# -O overwrites a tarball left by an earlier run instead of saving a ".1" copy
# and extracting the stale file.
wget -q -O aws-efa-installer-latest.tar.gz \
  https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz
tar -xf aws-efa-installer-latest.tar.gz
cd aws-efa-installer
chmod +x efa_installer.sh
sudo ./efa_installer.sh -y

# Point /usr/local/lib/libmpi.so at the MPI library shipped with EFA.
cd /usr/local/lib
sudo rm libmpi.so || echo ignoring
sudo ln -s "$EFA_HOME/lib64/libmpi.so" ./libmpi.so
################################################################################
# NCCL
################################################################################
# Clones NVIDIA/nccl into $FOLDER_ROOT/nccl (an existing clone is tolerated),
# runs the caller-supplied GIT_CHECKOUT_CMD inside it, then builds `src.build`.
# Reads: FOLDER_ROOT, GIT_CHECKOUT_CMD, logtag. Exports: NVCC_GENCODE.
if [ -z "${GIT_CHECKOUT_CMD+x}" ]; then
  # Do not store a prose placeholder in GIT_CHECKOUT_CMD: it is executed below
  # and a non-command string would abort the build with "command not found".
  echo "$logtag GIT_CHECKOUT_CMD is not set, skipping checkout"
else
  echo "$logtag Using existing git checkout cmd $GIT_CHECKOUT_CMD"
fi
echo "$logtag Installing nccl"
# Separate statements so `set -e` aborts if mkdir fails (it would not inside
# an `mkdir && cd` list).
mkdir -p "$FOLDER_ROOT"
cd "$FOLDER_ROOT"
git clone https://github.com/NVIDIA/nccl.git || echo "exists"
cd nccl
# Intentionally unquoted: the value is a whole command line (e.g.
# "git checkout v2.4.7-1") that must be word-split. Expands to nothing, and
# therefore runs nothing, when the variable is unset or empty.
${GIT_CHECKOUT_CMD:-}
# Only compile for Pascal/cuda 9+
# https://github.com/NVIDIA/nccl/issues/165
# remove 3.0 from list of supported architectures.
export NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=compute_70"
make -j src.build
################################################################################
# AWS NCCL OFI
################################################################################
# Clones aws/aws-ofi-nccl into $FOLDER_ROOT (an existing clone is tolerated),
# optionally applies ~/aws-ofi-nccl.patch, then configures, builds and installs
# into $FOLDER_ROOT/aws-ofi-nccl/install.
# Reads: FOLDER_ROOT, MPI_HOME, EFA_HOME, NCCL_HOME, CUDA_HOME, logtag.
echo "$logtag Installing aws-ofi-nccl"
# Separate statements so `set -e` aborts if mkdir fails (it would not inside
# an `mkdir && cd` list).
mkdir -p "$FOLDER_ROOT"
cd "$FOLDER_ROOT"
git clone https://github.com/aws/aws-ofi-nccl.git || echo exists
cd aws-ofi-nccl
if [ -f "$HOME/aws-ofi-nccl.patch" ]; then
  echo "using ~/aws-ofi-nccl.patch version of aws-ofi-nccl"
  # git apply fails with "patch does not apply", use patch command instead
  # git apply ~/aws-ofi-nccl.patch
  patch -p1 < "$HOME/aws-ofi-nccl.patch"
else
  echo "$logtag using master version of aws-ofi-nccl"
  # git apply ~/aws-ofi-nccl.patch
fi
sudo yum install libudev-devel -y
./autogen.sh
# -p: the clone step above accepts a pre-existing checkout, so the install
# directory may already exist; plain `mkdir` would abort the script here.
mkdir -p install
./configure --prefix="$FOLDER_ROOT/aws-ofi-nccl/install" \
  --with-mpi="$MPI_HOME" \
  --with-libfabric="$EFA_HOME" \
  --with-nccl="$NCCL_HOME" \
  --with-cuda="$CUDA_HOME"
LDFLAGS="-L$EFA_HOME/lib64" make MPI=1 MPI_HOME="$MPI_HOME" CUDA_HOME="$CUDA_HOME" NCCL_HOME="$NCCL_HOME"
sudo make install
# && make && make install
################################################################################
# nccl-tests
################################################################################
# Clones NVIDIA/nccl-tests into $FOLDER_ROOT (an existing clone is tolerated)
# and builds it with MPI support against the NCCL build in $NCCL_HOME.
# Reads: FOLDER_ROOT, EFA_HOME, MPI_HOME, CUDA_HOME, NCCL_HOME.
echo "Installing nccl-tests"
# Separate statements so `set -e` aborts if mkdir fails (it would not inside
# an `mkdir && cd` list).
mkdir -p "$FOLDER_ROOT"
cd "$FOLDER_ROOT"
git clone https://github.com/NVIDIA/nccl-tests.git || echo "exists"
cd nccl-tests
# TODO(y): is this same as as MPI_HOME, or are there extra things in anaconda3?
#export LD_LIBRARY_PATH=$HOME/anaconda3/lib/:$LD_LIBRARY_PATH:$FOLDER_ROOT/aws-ofi-nccl/install/lib:
LDFLAGS="-L$EFA_HOME/lib64" make MPI=1 MPI_HOME="$MPI_HOME" CUDA_HOME="$CUDA_HOME" NCCL_HOME="$NCCL_HOME"
# ################################################################################
# # PyTorch
# ################################################################################
# echo "Installing PyTorch"
# mkdir -p $FOLDER_ROOT && cd $FOLDER_ROOT
# git clone --recursive https://github.com/pytorch/pytorch
# cd pytorch
# git fetch
# git checkout v1.1.0
# source activate pytorch_p36
# export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
# export NCCL_ROOT_DIR=$FOLDER_ROOT/nccl
# rm -Rf build
# python setup.py install
# Return to the directory saved by the `pushd .` at the top of the script and
# turn errexit back off so it does not stay active in the sourcing shell.
popd
set +e