From 689a697c3af7dafc8df6e1cbb45b2a3bb0ac8587 Mon Sep 17 00:00:00 2001 From: Brian Curtis Date: Fri, 22 Mar 2024 13:17:00 +0000 Subject: [PATCH] Bring in fewer hard-coded values for ecflow_run --- tests/rt.sh | 62 +++++++++++++++++++++++------------------------ tests/rt_utils.sh | 58 +++++++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 51 deletions(-) diff --git a/tests/rt.sh b/tests/rt.sh index c359dcb607..35057f2ffc 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -522,8 +522,8 @@ cleanup() { trap '{ echo "rt.sh interrupted"; rt_trap ; }' INT trap '{ echo "rt.sh quit"; rt_trap ; }' QUIT trap '{ echo "rt.sh terminated"; rt_trap ; }' TERM -trap '{ echo "rt.sh error on line $LINENO"; cleanup ; }' ERR -trap '{ echo "rt.sh finished"; cleanup ; }' EXIT +trap '{ echo "rt.sh error on line $LINENO"; rt_trap ; }' ERR +trap '{ echo "rt.sh finished"; rt_trap ; }' EXIT # PATHRT - Path to regression tests directory PATHRT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -P )" @@ -544,7 +544,7 @@ else exit 1 fi -source detect_machine.sh # Note: this does not set ACCNR. The "if" block below does. 
+source detect_machine.sh source rt_utils.sh # shellcheck disable=SC1091 source module-setup.sh @@ -656,36 +656,12 @@ fi # Display the machine and account using the format detect_machine.sh used: echo "Machine: ${MACHINE_ID} | Account: ${ACCNR}" -if [[ ${MACHINE_ID} = wcoss2 ]]; then +if [[ ${MACHINE_ID} = wcoss2 || ${MACHINE_ID} = acorn ]]; then module load ecflow/5.6.0.13 module load intel/19.1.3.304 module load python/3.8.6 - # ECFLOW_START="${ECF_ROOT}/scripts/server_check.sh" - # export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir" - # export ECF_COMDIR="${PATHRT}/ecf_comdir" - # rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}" - # mkdir -p "${ECF_OUTPUTDIR}" - # mkdir -p "${ECF_COMDIR}" - export colonifnco=":output" # hack - - DISKNM=/lfs/h2/emc/nems/noscrub/emc.nems/RT - QUEUE=dev - COMPILE_QUEUE=dev - ROCOTO_SCHEDULER=pbs - PARTITION= - STMP="/lfs/h2/emc/ptmp" - PTMP="/lfs/h2/emc/ptmp" - SCHEDULER="pbs" - -elif [[ ${MACHINE_ID} = acorn ]]; then - - module load ecflow/5.6.0.13 - module load intel/19.1.3.304 - module load python/3.8.6 - INPUTDATA_ROOT=${INPUTDATA_ROOT:-${DISKNM}/NEMSfv3gfs/input-data-20221101} - ECF_ROOT=${ECF_ROOT:-} ECFLOW_START="${ECF_ROOT}/scripts/server_check.sh" export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir" export ECF_COMDIR="${PATHRT}/ecf_comdir" @@ -694,7 +670,7 @@ elif [[ ${MACHINE_ID} = acorn ]]; then mkdir -p "${ECF_COMDIR}" export colonifnco=":output" # hack - DISKNM=/lfs/h1/emc/nems/noscrub/emc.nems/RT + DISKNM=/lfs/h2/emc/nems/noscrub/emc.nems/RT QUEUE=dev COMPILE_QUEUE=dev ROCOTO_SCHEDULER=pbs @@ -703,6 +679,30 @@ elif [[ ${MACHINE_ID} = acorn ]]; then PTMP="/lfs/h2/emc/ptmp" SCHEDULER="pbs" +# elif [[ ${MACHINE_ID} = acorn ]]; then + +# module load ecflow/5.6.0.13 +# module load intel/19.1.3.304 +# module load python/3.8.6 +# INPUTDATA_ROOT=${INPUTDATA_ROOT:-${DISKNM}/NEMSfv3gfs/input-data-20221101} +# ECF_ROOT=${ECF_ROOT:-} +# ECFLOW_START="${ECF_ROOT}/scripts/server_check.sh" +# export ECF_OUTPUTDIR="${PATHRT}/ecf_outputdir" +# export 
ECF_COMDIR="${PATHRT}/ecf_comdir" +# rm -rf "${ECF_OUTPUTDIR}" "${ECF_COMDIR}" +# mkdir -p "${ECF_OUTPUTDIR}" +# mkdir -p "${ECF_COMDIR}" +# export colonifnco=":output" # hack + +# DISKNM=/lfs/h2/emc/nems/noscrub/emc.nems/RT +# QUEUE=dev +# COMPILE_QUEUE=dev +# ROCOTO_SCHEDULER=pbs +# PARTITION= +# STMP="/lfs/h2/emc/ptmp" +# PTMP="/lfs/h2/emc/ptmp" + # SCHEDULER="pbs" + elif [[ ${MACHINE_ID} = gaea ]]; then module use /ncrc/proj/epic/rocoto/modulefiles @@ -919,7 +919,7 @@ else fi # Does this machine support Rocoto? -if [[ -n ${ROCOTO} ]]; then +if [[ ${ROCOTO} == true ]]; then if [[ ${MACHINE_ID} != wcoss2 && ${MACHINE_ID} != acorn && ${MACHINE_ID} != expanse && ${MACHINE_ID} != stampede ]]; then ROCOTORUN="$(command -v rocotorun)" export ROCOTORUN @@ -933,7 +933,7 @@ if [[ -n ${ROCOTO} ]]; then fi # Does this machine support ecflow? -if [[ -n ${ECFLOW} ]]; then +if [[ ${ECFLOW} == true ]]; then if [[ ${MACHINE_ID} == wcoss2 && ${MACHINE_ID} == acorn ]]; then ECFLOW_START="$(command -v server_check.sh)" diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index b78c15e597..b75aa57327 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -763,39 +763,57 @@ ecflow_run() { ECF_HOST="${ECF_HOST:-${HOSTNAME}}" - set +e + #set +e + # Make sure ECF_HOST and ECF_PORT are set/ready on systems that have an + # explicit ecflow node + if [[ ${MACHINE_ID} == wcoss2 || ${MACHINE_ID} == acorn ]]; then + readarray -t ECFHOSTLIST < "${ECF_HOSTFILE}" + for ECF_HOST in "${ECFHOSTLIST[@]}" + do + if ssh -t -t "${ECF_HOST}"; then + export ECF_HOST + break + else + ECF_HOST='' + fi + done + elif [[ ${MACHINE_ID} == hera || ${MACHINE_ID} == jet ]]; then + module load ecflow + fi + if [[ -z ${ECF_HOST} || -z ${ECF_PORT} ]]; then + echo "ERROR: ECF_HOST or ECF_PORT are not set, and rt.sh cannot continue with ECFLOW" + exit 1 + else + echo "ECF_HOST: ${ECF_HOST}, ECF_PORT: ${ECF_PORT}" + fi + + # Start the ecflow_server ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}" 
not_running=$? if [[ ${not_running} -eq 1 ]]; then echo "ecflow_server is NOT running on ${ECF_HOST}:${ECF_PORT}" if [[ ${MACHINE_ID} == wcoss2 || ${MACHINE_ID} == acorn ]]; then - if [[ "${HOST::1}" == "a" ]]; then - export ECF_HOST=aecflow01 - elif [[ "${HOST::1}" == "c" ]]; then - export ECF_HOST=cdecflow01 - elif [[ "${HOST::1}" == "d" ]]; then - export ECF_HOST=ddecflow01 - fi #shellcheck disable=SC2029 - ssh "${ECF_HOST}" "bash -l -c \"module load ecflow && ecflow_start.sh -p ${ECF_PORT}\"" + ssh "${ECF_HOST}" "bash -l -c \"module load ecflow && ${ECFLOW_START} -p ${ECF_PORT}\"" elif [[ ${MACHINE_ID} == hera || ${MACHINE_ID} == jet ]]; then - module load ecflow - echo "On ${MACHINE_ID}, start ecFlow server on dedicated node ${ECF_HOST}" #shellcheck disable=SC2029 ssh "${ECF_HOST}" "bash -l -c \"module load ecflow && ${ECFLOW_START} -d ${RUNDIR_ROOT}/ecflow_server\"" else ${ECFLOW_START} -p "${ECF_PORT}" -d "${RUNDIR_ROOT}/ecflow_server" fi + + # Try pinging ecflow server now, and erroring out if not there. + ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}" + not_running=$? + if [[ ${not_running} -eq 1 ]]; then + echo "ERROR: Failure to start ecflow, exiting..." 
+ exit 1 + fi else echo "ecflow_server is already running on ${ECF_HOST}:${ECF_PORT}" fi - set -e - + #set -e ECFLOW_RUNNING=true - - export ECF_PORT - export ECF_HOST - ecflow_client --load="${ECFLOW_RUN}/${ECFLOW_SUITE}.def" ecflow_client --begin="${ECFLOW_SUITE}" ecflow_client --restart @@ -819,16 +837,16 @@ ecflow_run() { ecflow_kill() { [[ ${ECFLOW_RUNNING:-false} == true ]] || return - set +e + #set +e ecflow_client --suspend "/${ECFLOW_SUITE}" ecflow_client --kill "/${ECFLOW_SUITE}" sleep 20 - ecflow_client --delete=force yes"/${ECFLOW_SUITE}" + ecflow_client --delete=force yes "/${ECFLOW_SUITE}" } ecflow_stop() { [[ ${ECFLOW_RUNNING:-false} == true ]] || return - set +e + #set +e SUITES=$( ecflow_client --get ) SUITES=$( grep "^suite" <<< "${SUITES}" ) #SUITES=$( ecflow_client --get | grep "^suite" )