Skip to content

Commit

Permalink
Fixes failing CI on GPU and HVD (#2357)
Browse files Browse the repository at this point in the history
* Fixes failing CI on GPU and HVD
- Updated nccl to 2.11 and commented out APEX installation

* Added a pattern option to run only selected hvd tests

* Added a sleep to _hvd_task_with_init
  • Loading branch information
vfdev-5 authored Dec 15, 2021
1 parent 4ddcf29 commit 105e5d6
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
16 changes: 10 additions & 6 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,10 @@ install_dependencies: &install_dependencies
name: Install dependencies
command: |
docker exec -it pthd pip install -r requirements-dev.txt
export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex'
export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}"
docker exec -it pthd /bin/bash -c "$install_git_apex_cmd"
# APEX installation is commented out, see https://github.com/pytorch/ignite/issues/2299
# export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex'
# export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}"
# docker exec -it pthd /bin/bash -c "$install_git_apex_cmd"
export install_ignite_cmd='python setup.py install'
docker exec -it pthd /bin/bash -c "$install_ignite_cmd"
Expand All @@ -113,6 +114,9 @@ download_mnist: &download_mnist
- run:
name: Download MNIST
command: |
export install_git_cmd="apt-get update && apt-get install -y --no-install-recommends git"
docker exec -it pthd /bin/bash -c "$install_git_cmd"
export tmp_mnist_dir='/tmp/mnist'
export tests_mnist_dir='/tmp'
export examples_mnist_dir='.'
Expand Down Expand Up @@ -337,17 +341,17 @@ jobs:
# and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778
docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git"
docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git /horovod && cd /horovod && python setup.py sdist"
docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.8 -c conda-forge"
docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.11 -c conda-forge"
docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig'
docker exec -it pthd horovodrun --check-build
- run:
name: Run 1 Node 2 GPUs Unit Tests
command: |
export test_cmd='bash tests/run_gpu_tests.sh'
export test_cmd='bash tests/run_gpu_tests.sh 1 hvd'
docker exec -it pthd /bin/bash -c "${test_cmd}"
# no CUDA devices Horovod tests
export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed'
export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd'
docker exec -it pthd /bin/bash -c "${test_cmd}"
- run:
Expand Down
7 changes: 7 additions & 0 deletions tests/ignite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,13 @@ def _hvd_task_with_init(func, args):
torch.cuda.set_device(lrank)

func(*args)

# Sleep briefly to avoid flaky failures on CircleCI:
# sometimes a rank is terminated before the final collective
# op has finished.
# https://github.com/pytorch/ignite/pull/2357
time.sleep(2)

hvd.shutdown()


Expand Down
7 changes: 6 additions & 1 deletion tests/run_gpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ else
ngpus=$1
fi

pattern=""
if [ -n "$2" ]; then
pattern="-k $2"
fi

set -xeu

pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k 'cuda'
Expand All @@ -15,7 +20,7 @@ if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
exit 0
fi

pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed
pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed ${pattern}


if [ ${ngpus} -gt 1 ]; then
Expand Down

0 comments on commit 105e5d6

Please sign in to comment.