diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ad20fac..446c46a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ # Contributing -## Testing Scripts Locally +## Testing Locally Before opening a pull-request, it is recommended to test all changes locally. @@ -65,3 +65,33 @@ Since the current working directory is a standard RAPIDS repository, the CI scri ```sh ./ci/test_python.sh ``` + +## Testing in CI + +The tools here are all executable, so testing an alternative branch just requires downloading +the files and putting the `tools/` directory on `PATH`. + +For example, create a script called `use_gha_tools_from_branch.sh` with the following contents in the project's `ci/` directory. + +```shell +#!/bin/bash + +# fill these in +GHA_TOOLS_BRANCH= +GHA_TOOLS_REPO_ORG= + +git clone \ + --branch ${GHA_TOOLS_BRANCH} \ + https://github.com/${GHA_TOOLS_REPO_ORG}/gha-tools.git \ + /tmp/gha-tools + +unset GHA_TOOLS_BRANCH GHA_TOOLS_REPO_ORG + +export PATH="/tmp/gha-tools/tools":$PATH +``` + +Source that script in all the `ci/` scripts. + +```shell +source ./ci/use_gha_tools_from_branch.sh +``` diff --git a/tools/rapids-conda-retry b/tools/rapids-conda-retry index 974f8d4..8f064eb 100755 --- a/tools/rapids-conda-retry +++ b/tools/rapids-conda-retry @@ -2,12 +2,8 @@ # # rapids-conda-retry # -# wrapper for conda that retries the command after a CondaHTTPError, -# ChecksumMismatchError, or JSONDecodeError (ideally, any conda error that -# is normally resolved by retrying) -# -# This must be set in order for the script to recognize failing exit codes when -# output is piped to tee +# wrapper for conda that retries the command after retryable errors like +# CondaHTTPError, ChecksumMismatchError, and JSONDecodeError # # Example usage: # $ rapids-conda-retry install cudatoolkit=11.0 rapids=0.16 @@ -23,9 +19,18 @@ # # RAPIDS_CONDA_RETRY_SLEEP - set to a positive integer to set the duration, in # seconds, to wait between retries. 
-# Default is a 10 second sleep +# Default is a 10 second sleep. +# +# RAPIDS_CONDA_RETRY_TIMEOUT - Timeout for each individual retry. +# Positive integers are interpreted as seconds, +# but unit strings like '2h' for "two hours" will also work. +# Default varies based on the command being run. # + +# This must be set in order for the script to recognize failing exit codes when +# output is piped to tee set -o pipefail + export RAPIDS_SCRIPT_NAME="rapids-conda-retry" condaretry_help=" @@ -43,6 +48,11 @@ ALSO rapids-conda-retry options can be set using the following env vars: RAPIDS_CONDA_RETRY_SLEEP - set to a positive integer to set the duration, in seconds, to wait between retries. Default is a 10 second sleep + + RAPIDS_CONDA_RETRY_TIMEOUT - Timeout for each individual retry. + Positive integers are interpreted as seconds, + but unit strings like '2h' for 'two hours' will also work. + Default varies based on the command being run. ========== " max_retries=${RAPIDS_CONDA_RETRY_MAX:=3} @@ -67,7 +77,7 @@ condaCmd=${RAPIDS_CONDA_EXE:=conda} # needToRetry: 1 if the command should be retried, 0 if it should not be function runConda { # shellcheck disable=SC2086 - ${condaCmd} ${args} 2>&1| tee "${outfile}" + timeout --verbose "${timeout_duration}" ${condaCmd} ${args} 2>&1| tee "${outfile}" exitcode=$? needToRetry=0 needToClean=0 @@ -130,6 +140,13 @@ function runConda { retryingMsg="Retrying, command resulted in a segfault. This may be an intermittent failure..." needToRetry=1 needToClean=1 + elif [[ $exitcode -eq 124 ]]; then + # 'timeout' returns exit code 124 when the timeout is exceeded. + # ref: https://man7.org/linux/man-pages/man1/timeout.1.html + exitMsg="Exiting, command exited with status 124 which often indicates a timeout (configured timeout='${timeout_duration}')." + exitMsg+=" To increase this timeout, set env variable RAPIDS_CONDA_RETRY_TIMEOUT." 
+ rapids-echo-stderr "${exitMsg}" + needToRetry=0 else rapids-echo-stderr "Exiting, no retryable ${RAPIDS_CONDA_EXE} errors detected: \ 'ChecksumMismatchError:', \ @@ -151,16 +168,17 @@ function runConda { segfault exit code 139" fi - if (( needToRetry == 1 )) && \ - (( retries >= max_retries )); then - # Catch instance where we run out of retries - rapids-echo-stderr "Exiting, reached max retries..." - else - # Give reason for retry - rapids-echo-stderr "${retryingMsg}" - if (( needToClean == 1 )); then - rapids-echo-stderr "Cleaning tarball cache before retrying..." - ${condaCmd} clean --tarballs -y + if (( needToRetry == 1 )); then + if (( retries >= max_retries )); then + # Catch instance where we run out of retries + rapids-echo-stderr "Exiting, reached max retries..." + else + # Give reason for retry + rapids-echo-stderr "${retryingMsg}" + if (( needToClean == 1 )); then + rapids-echo-stderr "Cleaning tarball cache before retrying..." + ${condaCmd} clean --tarballs -y + fi fi fi fi @@ -184,6 +202,24 @@ for arg in "$@"; do fi done +# Set a default timeout based on command being run. +# +# This prevents occupying a CI runner for too long in cases where +# other timeout mechanisms in 'conda' / 'mamba' are not sufficient to +# interrupt a long-running operation. 
+if [ -n "${RAPIDS_CONDA_RETRY_TIMEOUT:-}" ]; then + # allow timeout to be set by an environment variable + timeout_duration=${RAPIDS_CONDA_RETRY_TIMEOUT} +elif grep -q -E 'install|env.*create|env.*update' <<< "${args}"; then + # 'conda install', 'conda env create', 'conda env update' should never run for more than 45 minutes + timeout_duration='45m' +else + # other commands falling here might include 'conda mambabuild' or similar, + # which could take several hours for expensive-to-build projects + timeout_duration='6h' +fi +rapids-echo-stderr "timeout for conda operations: '${timeout_duration}'" + # Run command outfile=$(mktemp) # shellcheck disable=SC2086 diff --git a/tools/rapids-mamba-retry b/tools/rapids-mamba-retry index 5db5d00..907c9ed 100755 --- a/tools/rapids-mamba-retry +++ b/tools/rapids-mamba-retry @@ -17,7 +17,12 @@ # seconds, to wait between retries. # Default is a 10 second sleep # -# These are copied to RAPIDS_CONDA_RETRY_MAX and RAPIDS_CONDA_RETRY_SLEEP +# RAPIDS_MAMBA_RETRY_TIMEOUT - Timeout for each individual retry. +# Positive integers are interpreted as seconds, +# but unit strings like '2h' for "two hours" will also work. +# Default varies based on the command being run. +# +# These are copied to RAPIDS_CONDA_RETRY_MAX, RAPIDS_CONDA_RETRY_SLEEP, and RAPIDS_CONDA_RETRY_TIMEOUT. # # Similarly, the options `--mambaretry_...` are copied to their `--condaretry_...` equivalents @@ -33,6 +38,10 @@ if [[ -v RAPIDS_MAMBA_RETRY_SLEEP ]]; then export RAPIDS_CONDA_RETRY_SLEEP="${RAPIDS_MAMBA_RETRY_SLEEP}" fi +if [[ -v RAPIDS_MAMBA_RETRY_TIMEOUT ]]; then + export RAPIDS_CONDA_RETRY_TIMEOUT="${RAPIDS_MAMBA_RETRY_TIMEOUT}" +fi + # Rename --mambaretry_... options to --condaretry_... for arg in "$@"; do opt=${arg%%=*}