#!/bin/bash
set -ex

# See Note [Keep Going]
CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
if [[ "$CONTINUE_ON_ERROR" == "1" ]]; then
  set +e
fi
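
# Illustrative (values are examples): a caller that wants the keep-going
# behavior can export the flag before sourcing this script:
#   export CONTINUE_ON_ERROR=1
#   source ./common.sh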

# The system default cmake 3.10 cannot find MKL, so point it to the right place.
# CMAKE_PREFIX_PATH resolves to the first of the following that is set:
#   1. CMAKE_PREFIX_PATH (if already exported)
#   2. CONDA_PREFIX (if a conda environment is active)
#   3. The conda install directory (derived from the conda binary on PATH)
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}}
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$(python3-config --prefix)/lib"
echo "$LD_LIBRARY_PATH"

function clone_pytorch() {
  PYTORCH_DIR=$1
  XLA_DIR=$2
  git clone --quiet https://github.com/pytorch/pytorch.git "$PYTORCH_DIR"
  # Copy the current torch_xla checkout into the fresh PyTorch tree.
  cp -r "$PWD" "$XLA_DIR"
}
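
# Illustrative usage (directory names are just examples): clone PyTorch and
# place a copy of the current torch_xla checkout inside it.
#   clone_pytorch /tmp/pytorch /tmp/pytorch/xla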

function apply_patches() {
  # Assumes the caller is inside the pytorch directory.
  ./xla/scripts/apply_patches.sh
}

function rebase_pull_request_on_target_branch() {
  # TODO: use the env var directly once CircleCI exposes the base branch.
  # Try rebasing on top of the base (destination) branch first.
  # This lets us pick up the latest fix for PT-XLA breakage,
  # and may also improve build time thanks to the warm cache.
  git config --global user.email "circleci.ossci@gmail.com"
  git config --global user.name "CircleCI"
  sudo apt-get update && sudo apt-get -qq install jq
  # Only rebase on runs triggered by PR checks, not post-submits.
  if [[ -n "${CIRCLE_PULL_REQUEST}" ]]; then
    PR_NUM=$(basename "$CIRCLE_PULL_REQUEST")
    CIRCLE_PR_BASE_BRANCH=$(curl -s "https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$PR_NUM" | jq -r '.base.ref')
    git rebase "origin/${CIRCLE_PR_BASE_BRANCH}"
    git submodule deinit -f .
    git submodule update --init --recursive
  fi
}

function checkout_torch_pin_if_available() {
  COMMITID_FILE="xla/.torch_pin"
  if [ -e "$COMMITID_FILE" ]; then
    git checkout "$(cat "$COMMITID_FILE")"
  fi
  git submodule update --init --recursive
}
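
# Illustrative: when present, xla/.torch_pin holds a single ref that git can
# check out, e.g. a branch name or a commit SHA (example value below):
#   echo "release/2.1" > xla/.torch_pin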

function install_deps_pytorch_xla() {
  XLA_DIR=$1
  USE_CACHE="${2:-0}"

  # Install pytorch deps
  pip install sympy

  # Install ninja to speed up the build
  pip install ninja

  # Install libraries required for running some PyTorch test suites
  pip install hypothesis
  pip install cloud-tpu-client
  pip install absl-py
  pip install --upgrade "numpy>=1.18.5"
  pip install --upgrade numba

  # Using the Ninja generator requires CMake version 3.13 or greater
  pip install "cmake>=3.13" --upgrade

  sudo apt-get -qq update
  sudo apt-get -qq install npm nodejs

  # Install LCOV and llvm-cov to generate C++ coverage reports
  sudo apt-get install -y lcov

  # The XLA build requires Bazel.
  # We use bazelisk to avoid updating the Bazel version manually.
  sudo npm install -g @bazel/bazelisk
  # Only unlink if the file exists
  if [[ -e /usr/bin/bazel ]]; then
    sudo unlink /usr/bin/bazel
  fi
  sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel

  # Symlink the missing CUDA headers if they exist
  CUBLAS_PATTERN="/usr/include/cublas*"
  if ls $CUBLAS_PATTERN 1> /dev/null 2>&1; then
    sudo ln -s $CUBLAS_PATTERN /usr/local/cuda/include
  fi
}
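
# Illustrative sanity check (not run by CI): after the symlink above, `bazel`
# resolves through bazelisk, which fetches the Bazel version pinned by a
# .bazelversion file when one is present in the workspace.
#   bazel --version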

function build_torch_xla() {
  XLA_DIR=$1
  pushd "$XLA_DIR"
  python setup.py install
  popd
}
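
# Illustrative usage (path is just an example), run after clone_pytorch and
# apply_patches:
#   build_torch_xla /tmp/pytorch/xla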

function run_torch_xla_python_tests() {
  PYTORCH_DIR=$1
  XLA_DIR=$2
  USE_COVERAGE="${3:-0}"

  pushd "$XLA_DIR"
  echo "Running Python Tests"
  if [ "$USE_COVERAGE" != "0" ]; then
    pip install coverage==6.5.0 --upgrade
    pip install coverage-lcov
    ./test/run_tests.sh
    coverage combine
    mkdir lcov && cp .coverage lcov/
    coverage-lcov --data_file_path lcov/.coverage
    coverage html
    cp lcov.info htmlcov/
    mv htmlcov ~/
    chmod -R 755 ~/htmlcov
  else
    ./test/run_tests.sh
    # CUDA tests
    if [ -x "$(command -v nvidia-smi)" ]; then
      # These tests fail on CUDA with the 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
      # TODO(xiowei replace gpu with cuda): remove the test below with PJRT_DEVICE=GPU because PJRT_DEVICE=GPU is being deprecated.
      PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
      XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
      # Syncfree SGD optimizer tests
      if [ -d ./torch_xla/amp/syncfree ]; then
        echo "Running Syncfree Optimizer Test"
        PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py

        # The following test scripts are mainly useful for performance
        # evaluation & comparison among different AMP optimizers.
        # echo "Running ImageNet Test"
        # python test/test_train_mp_imagenet_amp.py --fake_data --num_epochs=1

        # Disabled per https://github.com/pytorch/xla/pull/2809
        # echo "Running MNIST Test"
        # python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
      fi
    fi
  fi
  popd
}
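
# Illustrative: a non-zero third argument enables the coverage path above,
# which leaves the HTML/LCOV artifacts under ~/htmlcov (paths are examples):
#   run_torch_xla_python_tests /tmp/pytorch /tmp/pytorch/xla 1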

function run_torch_xla_cpp_tests() {
  PYTORCH_DIR=$1
  XLA_DIR=$2
  USE_COVERAGE="${3:-0}"

  pushd "$XLA_DIR"
  echo "Running C++ Tests on PJRT"
  EXTRA_ARGS=""
  if [ "$USE_COVERAGE" != "0" ]; then
    EXTRA_ARGS="-C"
  fi
  if [ -n "$GCLOUD_SERVICE_KEY_FILE" ]; then
    EXTRA_ARGS="$EXTRA_ARGS -R"
  fi

  if [ "$USE_COVERAGE" != "0" ]; then
    # TODO(yeounoh) shard the coverage testing
    if [ -x "$(command -v nvidia-smi)" ]; then
      PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
      cp "$XLA_DIR/bazel-out/_coverage/_coverage_report.dat" /tmp/cov1.dat
      PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
      cp "$XLA_DIR/bazel-out/_coverage/_coverage_report.dat" /tmp/cov2.dat
      lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
    else
      PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
      cp "$XLA_DIR/bazel-out/_coverage/_coverage_report.dat" /tmp/merged.dat
    fi
    genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
    mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
  else
    # Shard GPU testing
    if [ -x "$(command -v nvidia-smi)" ]; then
      PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
      PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
    else
      PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
    fi
  fi
  popd
}

function run_torch_xla_tests() {
  PYTORCH_DIR=$1
  XLA_DIR=$2
  USE_COVERAGE="${3:-0}"
  RUN_CPP="${RUN_CPP_TESTS:-0}"
  RUN_PYTHON="${RUN_PYTHON_TESTS:-0}"

  if [ -x "$(command -v nvidia-smi)" ]; then
    num_devices=$(nvidia-smi --list-gpus | wc -l)
    echo "Found $num_devices GPU devices..."
    export GPU_NUM_DEVICES=$num_devices
  fi
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
  export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

  # TODO(yeounoh) the test coverage workflow is not parallelized.
  if [[ -z "$RUN_CPP_TESTS1" && -z "$RUN_CPP_TESTS2" && -z "$RUN_PYTHON_TESTS" || "$USE_COVERAGE" != "0" ]]; then
    run_torch_xla_python_tests "$PYTORCH_DIR" "$XLA_DIR" "$USE_COVERAGE"
    run_torch_xla_cpp_tests "$PYTORCH_DIR" "$XLA_DIR" "$USE_COVERAGE"
  else
    # Run the Python and C++ tests separately.
    if [[ "$RUN_PYTHON_TESTS" == "python_tests" ]]; then
      run_torch_xla_python_tests "$PYTORCH_DIR" "$XLA_DIR" "$USE_COVERAGE"
    else
      run_torch_xla_cpp_tests "$PYTORCH_DIR" "$XLA_DIR" "$USE_COVERAGE"
    fi
  fi
}
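
# Illustrative end-to-end flow for a job that sources this script (directory
# names are examples; building PyTorch itself is handled outside this script):
#   clone_pytorch /tmp/pytorch /tmp/pytorch/xla
#   cd /tmp/pytorch
#   checkout_torch_pin_if_available
#   apply_patches
#   install_deps_pytorch_xla /tmp/pytorch/xla
#   build_torch_xla /tmp/pytorch/xla
#   export RUN_PYTHON_TESTS=python_tests   # optional: run only the Python suite
#   run_torch_xla_tests /tmp/pytorch /tmp/pytorch/xla 0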