Skip to content

Commit

Permalink
retry with last failed logic
Browse files Browse the repository at this point in the history
  • Loading branch information
leej3 committed Apr 4, 2024
1 parent 9d6ef49 commit 5b8c164
Show file tree
Hide file tree
Showing 8 changed files with 15 additions and 9 deletions.
1 change: 1 addition & 0 deletions .github/workflows/hvd-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh
new_command_on_retry: EXTRA_PYTEST_ARGS="--last-failed --last-failed-no-failures none" bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pytorch-version-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh "not test_time_profilers"
new_command_on_retry: EXTRA_PYTEST_ARGS="--last-failed --last-failed-no-failures none" bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ jobs:
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
new_command_on_retry: EXTRA_PYTEST_ARGS="--last-failed --last-failed-no-failures none" bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
new_command_on_retry: EXTRA_PYTEST_ARGS="--last-failed --last-failed-no-failures none" SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
5 changes: 3 additions & 2 deletions tests/run_cpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ fi

MATCH_TESTS_EXPRESSION=${1:-""}

CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" "${EXTRA_PYTEST_ARGS:-}" -k "$MATCH_TESTS_EXPRESSION"
# pytest exits with code 5 when no tests are collected; on a retry with --last-failed this happens when every test in this stage passed previously, so treat it as success
CUDA_VISIBLE_DEVICES="" pytest ${EXTRA_PYTEST_ARGS:-} --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;}

# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
exit 0
fi

export WORLD_SIZE=2
CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv "${EXTRA_PYTEST_ARGS:-}" -k "$MATCH_TESTS_EXPRESSION"
# Distributed CPU tests, run on $WORLD_SIZE xdist workers with --dist=each.
# EXTRA_PYTEST_ARGS is intentionally unquoted so it word-splits into separate options.
# On a retry with --last-failed, all distributed tests may be deselected (the earlier
# failure was in the non-distributed stage); pytest then exits with code 5
# ("no tests collected"). Tolerate that code, as the first pytest call in this script
# does, and propagate every other failure.
CUDA_VISIBLE_DEVICES="" pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" || {
    exit_code=$?
    if [ "$exit_code" -eq 5 ]; then
        echo "All tests deselected"
    else
        exit $exit_code
    fi
}
unset WORLD_SIZE
8 changes: 4 additions & 4 deletions tests/run_gpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@ else
fi

set -xeu

pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ "${EXTRA_PYTEST_ARGS:-}" -k "$cuda_pattern"
# pytest exits with code 5 when no tests are collected; on a retry with --last-failed this happens when every test in this stage passed previously, so treat it as success
pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;}

# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
exit 0
fi

pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed "${EXTRA_PYTEST_ARGS:-}" -k "$MATCH_TESTS_EXPRESSION"
# Distributed-marked tests, run in a single process.
# EXTRA_PYTEST_ARGS is intentionally unquoted so it word-splits into separate options.
# On a retry with --last-failed, all of these tests may be deselected; pytest then
# exits with code 5 ("no tests collected"). Tolerate that code, as the first pytest
# call in this script does, and propagate every other failure.
pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION" || {
    exit_code=$?
    if [ "$exit_code" -eq 5 ]; then
        echo "All tests deselected"
    else
        exit $exit_code
    fi
}


if [ ${ngpus} -gt 1 ]; then

export WORLD_SIZE=${ngpus}
pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv "${EXTRA_PYTEST_ARGS:-}" -k "$MATCH_TESTS_EXPRESSION"
# Distributed-marked tests, run on one xdist worker per GPU with --dist=each.
# EXTRA_PYTEST_ARGS is intentionally unquoted so it word-splits into separate options.
# On a retry with --last-failed, all of these tests may be deselected; pytest then
# exits with code 5 ("no tests collected"). Tolerate that code, as the first pytest
# call in this script does, and propagate every other failure.
pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" || {
    exit_code=$?
    if [ "$exit_code" -eq 5 ]; then
        echo "All tests deselected"
    else
        exit $exit_code
    fi
}
unset WORLD_SIZE

fi
2 changes: 1 addition & 1 deletion tests/run_multinode_tests_in_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ RUN pip install --no-cache-dir mock pytest pytest-xdist scikit-learn scikit-imag
EOF

docker_python_version=`docker run --rm -i $docker_image python -c "import sys; print(str(sys.version_info[0]) + \".\" + str(sys.version_info[1]), end=\"\")"`
cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv ${EXTRA_PYTEST_ARGS} tests"
cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv ${EXTRA_PYTEST_ARGS:-} tests"

export MASTER_ADDR=node0
export MASTER_PORT=9999
Expand Down
5 changes: 3 additions & 2 deletions tests/run_tpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

set -xeu

pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu "${EXTRA_PYTEST_ARGS:-}"
# pytest exits with code 5 when no tests are collected; on a retry with --last-failed this happens when every test in this stage passed previously, so treat it as success
pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;}

if [ -z ${NUM_TPU_WORKERS+x} ]; then
export NUM_TPU_WORKERS=1
pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu "${EXTRA_PYTEST_ARGS:-}"
# Second TPU pass, run with NUM_TPU_WORKERS exported as 1.
# EXTRA_PYTEST_ARGS is intentionally unquoted so it word-splits into separate options.
# On a retry with --last-failed, all TPU tests may be deselected in this pass; pytest
# then exits with code 5 ("no tests collected"). Tolerate that code, as the first
# pytest call in this script does, and propagate every other failure.
pytest ${EXTRA_PYTEST_ARGS:-} --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu || {
    exit_code=$?
    if [ "$exit_code" -eq 5 ]; then
        echo "All tests deselected"
    else
        exit $exit_code
    fi
}
fi

0 comments on commit 5b8c164

Please sign in to comment.