diff --git a/.github/workflows/gpu-hvd-tests.yml b/.github/workflows/gpu-hvd-tests.yml
index 6661f46b501..2017cf8acda 100644
--- a/.github/workflows/gpu-hvd-tests.yml
+++ b/.github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
   gpu-hvd-tests:
     strategy:
       matrix:
-        pytorch-channel: [pytorch, ]
+        pytorch-channel: [pytorch]
       fail-fast: false
     env:
       DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
           # Can't build Horovod with recent pytorch due to pytorch required C++17 standard
           # and horovod is still using C++14
           # HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
-          # Using a similar hack as described here:
-          # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
+          # Using a similar hack as described here:
+          # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
           git clone --recursive https://github.com/horovod/horovod.git /horovod
           cd /horovod
           sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
           set -xe

           bash tests/run_gpu_tests.sh 2 hvd

-          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd
           EOF
           )

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 92345b3baed..faa84deffd9 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
       REPOSITORY: ${{ github.repository }}
       PR_NUMBER: ${{ github.event.pull_request.number }}
     runs-on: linux.8xlarge.nvidia.gpu
-    timeout-minutes: 45
+    timeout-minutes: 85
     steps:
       - name: Clean workspace
@@ -121,18 +121,13 @@ jobs:
       - name: Run GPU Unit Tests
         continue-on-error: false
-        run: |
-
-          script=$(cat << EOF
-
-          set -xe
-
-          bash tests/run_gpu_tests.sh 2
-
-          EOF
-          )
-
-          docker exec -t pthd /bin/bash -c "${script}"
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 25
+          shell: bash
+          command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+          new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3

diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml
index f483d21f38e..35e107f888b 100644
--- a/.github/workflows/hvd-tests.yml
+++ b/.github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
           target_dir: /tmp

       - name: Run Tests
-        shell: bash -l {0}
-        run: |
-          bash tests/run_cpu_tests.sh
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 15
+          shell: bash
+          command: bash tests/run_cpu_tests.sh
+          new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
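The workflow changes above (and the analogous ones below) all follow one pattern: the first attempt runs the full suite, and `new_command_on_retry` re-invokes the same script with `USE_LAST_FAILED=1`, so subsequent attempts only rerun tests recorded as failed or unrun. A minimal local sketch of that loop, assuming the same `max_attempts: 5` as the workflows (the retry action itself does more, e.g. per-attempt timeouts):

```bash
#!/bin/bash
# Rough local equivalent of the nick-fields/retry steps (sketch, not the action itself)
max_attempts=5
attempt=1
until bash tests/run_cpu_tests.sh; do
    attempt=$((attempt + 1))
    if [ "${attempt}" -gt "${max_attempts}" ]; then
        echo "Tests still failing after ${max_attempts} attempts"
        exit 1
    fi
    # From the second attempt on, the test scripts add --last-failed via USE_LAST_FAILED
    export USE_LAST_FAILED=1
done
```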
diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml
index e47f8faaa46..2e0ad5e0c98 100644
--- a/.github/workflows/pytorch-version-tests.yml
+++ b/.github/workflows/pytorch-version-tests.yml
@@ -10,7 +10,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    timeout-minutes: 45
+    timeout-minutes: 85
    strategy:
      max-parallel: 5
      fail-fast: false
@@ -18,7 +18,7 @@ jobs:
        python-version: [3.8, 3.9, "3.10"]
        pytorch-version: [2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
-        exclude:
+        exclude:
          - pytorch-version: 1.5.1
            python-version: 3.9
          - pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
          pip install -r requirements-dev.txt
          python setup.py install

-          # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
+          # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
          # which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
          bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
          if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@ jobs:
          target_dir: /tmp

      - name: Run Tests
-        shell: bash -l {0}
-        run: |
-          bash tests/run_cpu_tests.sh "not test_time_profilers"
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 15
+          shell: bash
+          command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+          new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

 #  create-issue:
 #    runs-on: ubuntu-latest

diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml
index 08eaaf30d8f..ab14ad3c1de 100644
--- a/.github/workflows/tpu-tests.yml
+++ b/.github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
          target_dir: /tmp

      - name: Run Tests
-        run: |
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-          export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-          export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-
-          python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-          bash tests/run_tpu_tests.sh
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 25
+          shell: bash
+          command: |
+            python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+            bash tests/run_tpu_tests.sh
+          new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+        env:
+          LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+          XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+          XRT_WORKERS: "localservice:0;grpc://localhost:40934"

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index a4b69725569..0b94e0d0e9e 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
 jobs:
   cpu-tests:
     runs-on: ${{ matrix.os }}
-    timeout-minutes: 45
+    timeout-minutes: 85
     defaults:
       run:
         shell: bash
@@ -40,7 +40,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
         pytorch-channel: [pytorch, pytorch-nightly]
         include:
           # includes a single build on windows
@@ -102,7 +102,7 @@ jobs:
       - name: Run Mypy
         # https://github.com/pytorch/ignite/pull/2780
-        #
+        #
         if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
         run: |
           bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@ jobs:
           cp -R /tmp/MNIST .

       - name: Run Tests
-        run: |
-          SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 5
+          timeout_minutes: 15
+          shell: bash
+          command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+          new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3

diff --git a/tests/common-test-functionality.sh b/tests/common-test-functionality.sh
new file mode 100644
index 00000000000..daf9d284f6b
--- /dev/null
+++ b/tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# Will catch exit code 5 when tests are deselected from a previous passing run
+# (relevant for --last-failed-no-failures none)
+last_failed_no_failures_code=5
+
+# Functions shared across test files
+run_tests() {
+    # Set defaults
+    local core_args="-vvv tests/ignite"
+    local cache_dir=".unknown-cache"
+    local skip_distrib_tests=1
+    local match_tests_expression=""
+    local trap_deselected_exit_code=1
+    local use_last_failed=0
+    local use_coverage=0
+    local world_size=0
+    # Always clean up pytest.ini
+    trap 'rm -f pytest.ini' RETURN
+    # Parse arguments
+    while [[ $# -gt 0 ]]
+    do
+        key="$1"
+        case $key in
+            --core_args)
+                core_args="$2"
+                shift
+                shift
+                ;;
+            --cache_dir)
+                cache_dir="$2"
+                shift
+                shift
+                ;;
+            --skip_distrib_tests)
+                skip_distrib_tests="$2"
+                shift
+                shift
+                ;;
+            --match_tests_expression)
+                match_tests_expression="$2"
+                shift
+                shift
+                ;;
+            --trap_deselected_exit_code)
+                trap_deselected_exit_code="$2"
+                shift
+                shift
+                ;;
+            --use_last_failed)
+                use_last_failed="$2"
+                shift
+                shift
+                ;;
+            --use_coverage)
+                use_coverage="$2"
+                shift
+                shift
+                ;;
+            --world_size)
+                world_size="$2"
+                shift
+                shift
+                ;;
+            *)
+                echo "Error: Unknown argument $key"
+                exit 1
+                ;;
+        esac
+    done
+
+    if [ "${skip_distrib_tests}" -eq "1" ]; then
+        # can be overwritten by core_args
+        skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
+    else
+        skip_distrib_opt=""
+    fi
+
+    echo "[pytest]" > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini
+
+    # Assemble options for the pytest command
+    pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
+    if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
+        pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
+    fi
+    if [ "${use_coverage}" -eq "1" ]; then
+        pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
+    fi
+    if [ "${world_size}" -ne "0" ]; then
+        export WORLD_SIZE="${world_size}"
+        pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
+    fi
+
+    # Run the command
+    if [ "$trap_deselected_exit_code" -eq "1" ]; then
+        CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
+    else
+        CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
+    fi
+}
diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py
index 265ae97e3e7..d5546a75bae 100644
--- a/tests/ignite/conftest.py
+++ b/tests/ignite/conftest.py
@@ -1,8 +1,10 @@
 import functools
 import os
 import shutil
+import signal
 import sys
 import tempfile
+import threading
 import time
 from pathlib import Path

@@ -13,10 +15,57 @@
 import ignite.distributed as idist


+def pytest_addoption(parser):
+    """
+    Add custom command line options for the ignite test suite here.
+
+    This function is a pytest hook (due to its name) and is *"automatically"
+    executed at the start of a test run. See:
+    https://docs.pytest.org/en/latest/reference/reference.html#initialization-hooks
+
+    * "automatically" holds provided this conftest.py file is in the root
+    directory. See:
+    https://docs.pytest.org/en/latest/reference/customize.html#initialization-determining-rootdir-and-configfile
+    """
+    parser.addoption(
+        "--treat-unrun-as-failed",
+        action="store_true",
+        help="""
+        If a session is interrupted, treat the unrun tests as failed so that a
+        rerun with --last-failed runs any tests that have not passed or been
+        skipped. Note that if all tests in a module have been skipped, the
+        module will be skipped for all subsequent runs.
+        """,
+    )
+
+
 def pytest_configure(config):
+    """
+    This function is a pytest hook (due to its name) and is run after command
+    line parsing is complete in order to configure the test session.
+    """
     config.addinivalue_line("markers", "distributed: run distributed")
     config.addinivalue_line("markers", "multinode_distributed: distributed")
     config.addinivalue_line("markers", "tpu: run on tpu")
+    if config.option.treat_unrun_as_failed:
+        unrun_tracker = UnrunTracker()
+        config.pluginmanager.register(unrun_tracker, "unrun_tracker_plugin")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def term_handler():
+    """
+    This allows the pytest session to be terminated upon retries on CI. It may
+    be worth using this fixture solely in that context. For a discussion on
+    whether SIGTERM should be ignored and why pytest usually ignores it see:
+    https://github.com/pytest-dev/pytest/issues/5243
+    """
+    if threading.current_thread() is threading.main_thread() and hasattr(signal, "SIGTERM"):
+        orig = signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT))
+        yield
+        signal.signal(signal.SIGTERM, orig)
+    else:
+        yield  # Just pass through if SIGTERM isn't supported or we are not in the main thread


 @pytest.fixture(
@@ -447,6 +496,40 @@ def distributed(request, local_rank, world_size):
         raise RuntimeError(f"Invalid parameter value for `distributed` fixture, given {request.param}")


+class UnrunTracker:
+    """
+    Keeps track of unrun tests to improve the user experience when using the
+    "--last-failed" pytest option and a test session is interrupted. This is
+    particularly useful on CI when rerunning "failing" tests where the failure
+    was due to a deadlock and many tests weren't actually run, so they didn't
+    actually fail. This is a pytest plugin that implements some standard hooks
+    to modify the test session. Its functionality can be added to a test session
+    by registering it with the pytest plugin manager.
+    """
+
+    def __init__(self):
+        self.unrun_tests = []
+
+    def pytest_collection_finish(self, session):
+        # At the end of the collection, add all items to the unrun_tests list
+        self.unrun_tests.extend(session.items)
+
+    def pytest_runtest_teardown(self, item):
+        if item in self.unrun_tests:
+            self.unrun_tests.remove(item)
+
+    def record_unrun_as_failed(self, session, exitstatus):
+        # Get current lastfailed entries (if any)
+        lastfailed = session.config.cache.get("cache/lastfailed", {})
+
+        # Add unrun tests to lastfailed
+        for test in self.unrun_tests:
+            lastfailed[test.nodeid] = True
+
+        # Update the cache with the new lastfailed
+        session.config.cache.set("cache/lastfailed", lastfailed)
+
+
 @pytest.hookimpl
 def pytest_pyfunc_call(pyfuncitem: pytest.Function) -> None:
     if any(fx in pyfuncitem.fixturenames for fx in ["distributed", "multinode_distributed"]):
@@ -508,3 +591,16 @@ def xla_worker(index, fn):
     assert ex_.code == 0, "Didn't successfully exit in XLA test"

     pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """
+    Any functionality that should be run at the end of the session should be
+    added here.
+    This is a pytest hook (due to its name) and is called after the whole test
+    run finished, right before returning the exit status to the system.
+    """
+    # If requested by the user, track all unrun tests and add them to the lastfailed cache
+    if session.config.option.treat_unrun_as_failed:
+        unrun_tracker = session.config.pluginmanager.get_plugin("unrun_tracker_plugin")
+        unrun_tracker.record_unrun_as_failed(session, exitstatus)

diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh
index 2297be94219..7d647de1e01 100644
--- a/tests/run_cpu_tests.sh
+++ b/tests/run_cpu_tests.sh
@@ -1,22 +1,31 @@
 #!/bin/bash
-
+source "$(dirname "$0")/common-test-functionality.sh"
 set -xeu

-if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
-    skip_distrib_opt=(-m "not distributed and not tpu and not multinode_distributed")
-else
-    skip_distrib_opt=(-m "")
-fi
+skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0}
+use_last_failed=${USE_LAST_FAILED:-0}
+match_tests_expression=${1:-""}

-MATCH_TESTS_EXPRESSION=${1:-""}
-CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION"
+run_tests \
+    --core_args "--tx 4*popen//python=python -vvv tests/ignite" \
+    --cache_dir ".cpu-not-distrib" \
+    --skip_distrib_tests "${skip_distrib_tests}" \
+    --use_coverage 1 \
+    --match_tests_expression "${match_tests_expression}" \
+    --use_last_failed ${use_last_failed}

 # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
-if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
+if [ "${skip_distrib_tests}" -eq "1" ]; then
     exit 0
 fi

-export WORLD_SIZE=2
-CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
-unset WORLD_SIZE
+# Run 2 processes with --dist=each
+run_tests \
+    --core_args "-m distributed -vvv tests/ignite" \
+    --world_size 2 \
+    --cache_dir ".cpu-distrib" \
+    --skip_distrib_tests 0 \
+    --use_coverage 1 \
+    --match_tests_expression "${match_tests_expression}" \
+    --use_last_failed ${use_last_failed}
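Together, `--treat-unrun-as-failed` and the `--last-failed` options give the reruns their semantics: an interrupted or deadlocked session records every collected-but-unrun test in pytest's `lastfailed` cache, and the next invocation selects exactly those plus the genuine failures. A two-step sketch of that flow, using only the options added above:

```bash
# First run: suppose it deadlocks and is killed by the retry action's timeout;
# UnrunTracker writes all unrun tests into the lastfailed cache at session finish.
pytest -vvv tests/ignite --treat-unrun-as-failed || true

# Rerun: select only failed + unrun tests. Exit code 5 ("all deselected") is
# expected when everything already passed, which is why run_tests traps it.
pytest -vvv tests/ignite --treat-unrun-as-failed --last-failed --last-failed-no-failures none
```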
diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh
index 3146443a531..371c70aee37 100644
--- a/tests/run_gpu_tests.sh
+++ b/tests/run_gpu_tests.sh
@@ -1,35 +1,47 @@
 #!/bin/bash
+source "$(dirname "$0")/common-test-functionality.sh"
+set -xeu

-if [ -z "$1" ]; then
-    ngpus=1
-else
-    ngpus=$1
-fi
-
-MATCH_TESTS_EXPRESSION=${2:-""}
+skip_distrib_tests=${SKIP_DISTRIB_TESTS:-1}
+use_last_failed=${USE_LAST_FAILED:-0}
+ngpus=${1:-1}

-if [ -z "$MATCH_TESTS_EXPRESSION" ]; then
+match_tests_expression=${2:-""}
+if [ -z "$match_tests_expression" ]; then
     cuda_pattern="cuda"
 else
-    cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION"
+    cuda_pattern="cuda and $match_tests_expression"
 fi

-set -xeu
-
-pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern"
+run_tests \
+    --core_args "-vvv tests/ignite" \
+    --cache_dir ".gpu-cuda" \
+    --skip_distrib_tests "${skip_distrib_tests}" \
+    --use_coverage 1 \
+    --match_tests_expression "${cuda_pattern}" \
+    --use_last_failed ${use_last_failed}

 # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
-if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
+if [ "${skip_distrib_tests}" -eq "1" ]; then
     exit 0
 fi

-pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION"
+run_tests \
+    --core_args "-vvv -m distributed tests/ignite" \
+    --cache_dir ".gpu-distrib" \
+    --skip_distrib_tests 0 \
+    --use_coverage 1 \
+    --match_tests_expression "${match_tests_expression}" \
+    --use_last_failed ${use_last_failed}

 if [ ${ngpus} -gt 1 ]; then
-
-    export WORLD_SIZE=${ngpus}
-    pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
-    unset WORLD_SIZE
-
+    run_tests \
+        --core_args "-vvv -m distributed tests/ignite" \
+        --world_size "${ngpus}" \
+        --cache_dir ".gpu-distrib-multi" \
+        --skip_distrib_tests 0 \
+        --use_coverage 1 \
+        --match_tests_expression "${match_tests_expression}" \
+        --use_last_failed ${use_last_failed}
 fi

diff --git a/tests/run_multinode_tests_in_docker.sh b/tests/run_multinode_tests_in_docker.sh
index 0dca1b60327..041284bb97c 100644
--- a/tests/run_multinode_tests_in_docker.sh
+++ b/tests/run_multinode_tests_in_docker.sh
@@ -36,7 +36,7 @@ RUN pip install --no-cache-dir mock pytest pytest-xdist scikit-learn scikit-image
 EOF

 docker_python_version=`docker run --rm -i $docker_image python -c "import sys; print(str(sys.version_info[0]) + \".\" + str(sys.version_info[1]), end=\"\")"`

-cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv tests"
+cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv tests/ignite"

 export MASTER_ADDR=node0
 export MASTER_PORT=9999
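Both the GPU and multinode scripts lean on pytest-xdist's `--dist=each` mode: every collected test is sent to each of the `--tx` workers, so exporting `WORLD_SIZE` and spawning that many `popen` workers emulates one process per rank. This is exactly what the replaced inline command did, reproduced here for reference:

```bash
# Old inline form of the distributed leg, equivalent to run_tests --world_size 2
export WORLD_SIZE=2
pytest --dist=each --tx ${WORLD_SIZE}*popen//python=python -m distributed -vvv tests/ignite
unset WORLD_SIZE
```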
diff --git a/tests/run_tpu_tests.sh b/tests/run_tpu_tests.sh
index 0877de858ae..5ea0f993173 100644
--- a/tests/run_tpu_tests.sh
+++ b/tests/run_tpu_tests.sh
@@ -1,10 +1,20 @@
 #!/bin/bash
-
+source "$(dirname "$0")/common-test-functionality.sh"
 set -xeu

+use_last_failed=${USE_LAST_FAILED:-0}
+
+run_tests \
+    --core_args "-vvv -m tpu tests/ignite" \
+    --cache_dir ".tpu" \
+    --use_coverage 1 \
+    --use_last_failed ${use_last_failed}
-pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu

 if [ -z ${NUM_TPU_WORKERS+x} ]; then
     export NUM_TPU_WORKERS=1
-    pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu
+    run_tests \
+        --core_args "-vvv -m tpu tests/ignite" \
+        --cache_dir ".tpu-multi" \
+        --use_coverage 1 \
+        --use_last_failed ${use_last_failed}
 fi
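Note that every leg gets its own `cache_dir` through the generated `pytest.ini` (`.tpu`, `.tpu-multi`, `.cpu-distrib`, `.gpu-cuda`, ...), so a rerun of one leg cannot clobber another leg's `lastfailed` state. Assuming pytest's standard cache layout under the configured directory, the failures recorded by a previous run can be inspected directly:

```bash
# lastfailed is a JSON mapping of test node ids to True (path assumes pytest's
# default cache layout, <cache_dir>/v/cache/lastfailed)
python -m json.tool .cpu-not-distrib/v/cache/lastfailed
```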