Skip to content

Commit

Permalink
Cloud Dependencies for Regression Testing (#1667)
Browse files Browse the repository at this point in the history
* Update default_vars.sh to add noaacloud

* Update detect_machine.sh to add noaacloud

* Update module-setup.sh to add noaacloud env variables

* Add noaacloud and env variable to rt.sh

* Update module-setup.sh

* Create ufs_noaacloud.intel.lua

* Create ufs_common_spack.lua

* Create compile_slurm.IN_noaacloud

* Create fv3_slurm.IN_noaacloud

* Update rt_utils.sh

* Add the requested if statement to default_vars

* Add the -B option back to nccmp
  • Loading branch information
zach1221 authored May 3, 2023
1 parent 3343326 commit ee35891
Show file tree
Hide file tree
Showing 16 changed files with 3,173 additions and 2,972 deletions.
57 changes: 57 additions & 0 deletions modulefiles/ufs_common_spack.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
help([[
loads UFS Model common libraries
]])

-- Common library stack for the UFS build environment.
--
-- Every library is loaded as "<module>/<version>". The version comes from the
-- environment variable named on the same line (for example jasper_ver) when it
-- is set, and otherwise falls back to the default written here.

jasper_ver=os.getenv("jasper_ver") or "2.0.32"
load(pathJoin("jasper", jasper_ver))

zlib_ver=os.getenv("zlib_ver") or "1.2.13"
load(pathJoin("zlib", zlib_ver))

libpng_ver=os.getenv("libpng_ver") or "1.6.37"
load(pathJoin("libpng", libpng_ver))

hdf5_ver=os.getenv("hdf5_ver") or "1.14.0"
load(pathJoin("hdf5", hdf5_ver))

-- netcdf_ver selects the netcdf-c module only.
netcdf_ver=os.getenv("netcdf_ver") or "4.9.0"
load(pathJoin("netcdf-c", netcdf_ver))

-- netcdf-fortran carries its own version number, so it gets its own override
-- like every other library in this file; the default is unchanged (4.6.0).
netcdf_fortran_ver=os.getenv("netcdf_fortran_ver") or "4.6.0"
load(pathJoin("netcdf-fortran", netcdf_fortran_ver))

pio_ver=os.getenv("pio_ver") or "2.5.9"
load(pathJoin("parallelio", pio_ver))

esmf_ver=os.getenv("esmf_ver") or "8.3.0b09"
load(pathJoin("esmf", esmf_ver))

fms_ver=os.getenv("fms_ver") or "2022.04"
load(pathJoin("fms", fms_ver))

bacio_ver=os.getenv("bacio_ver") or "2.4.1"
load(pathJoin("bacio", bacio_ver))

crtm_ver=os.getenv("crtm_ver") or "2.4.0"
load(pathJoin("crtm", crtm_ver))

g2_ver=os.getenv("g2_ver") or "3.4.5"
load(pathJoin("g2", g2_ver))

g2tmpl_ver=os.getenv("g2tmpl_ver") or "1.10.2"
load(pathJoin("g2tmpl", g2tmpl_ver))

ip_ver=os.getenv("ip_ver") or "3.3.3"
load(pathJoin("ip", ip_ver))

sp_ver=os.getenv("sp_ver") or "2.3.3"
load(pathJoin("sp", sp_ver))

w3emc_ver=os.getenv("w3emc_ver") or "2.9.2"
load(pathJoin("w3emc", w3emc_ver))

gftl_shared_ver=os.getenv("gftl_shared_ver") or "v1.5.0"
load(pathJoin("gftl-shared", gftl_shared_ver))

-- The default MAPL version string embeds the default ESMF version (8.3.0b09).
-- NOTE(review): presumably the two must be overridden together -- confirm.
mapl_ver=os.getenv("mapl_ver") or "2.22.0-esmf-8.3.0b09"
load(pathJoin("mapl", mapl_ver))

whatis("Description: UFS build environment common libraries")
28 changes: 28 additions & 0 deletions modulefiles/ufs_noaacloud.intel.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
help([[
loads UFS Model prerequisites for NOAA Parallelworks/Intel
]])

-- Build-environment modulefile for the platform identified below as
-- "noaacloud.intel". Statement order matters: each MODULEPATH entry is added
-- before the modules that follow it are loaded.
--
-- Every *_ver value may be overridden through the environment variable of the
-- same name; the literal after "or" is the default.

-- Put the spack-stack 1.3.0 "unified-dev" Core modulefiles on MODULEPATH.
prepend_path("MODULEPATH", "/contrib/EPIC/spack-stack/spack-stack-1.3.0/envs/unified-dev/install/modulefiles/Core")

-- Compiler stack.
stack_intel_ver=os.getenv("stack_intel_ver") or "2021.3.0"
load(pathJoin("stack-intel", stack_intel_ver))

-- MPI stack.
stack_impi_ver=os.getenv("stack_impi_ver") or "2021.3.0"
load(pathJoin("stack-intel-oneapi-mpi", stack_impi_ver))

cmake_ver=os.getenv("cmake_ver") or "3.23.1"
load(pathJoin("cmake", cmake_ver))

-- A second module tree is added ahead of the first before stack-python is
-- loaded. NOTE(review): presumably stack-python is provided from this tree --
-- confirm.
prepend_path("MODULEPATH", "/contrib/spack-stack/modulefiles/core")
stack_python_ver=os.getenv("stack_python_ver") or "3.9.12"
load(pathJoin("stack-python", stack_python_ver))

-- Load the shared library set by bare module name; it must be resolvable from
-- the MODULEPATH in effect at this point.
load("ufs_common_spack")

-- Compiler commands exported for the build (presumably the Intel MPI compiler
-- wrappers for C, C++ and Fortran -- confirm).
setenv("CC", "mpiicc")
setenv("CXX", "mpiicpc")
setenv("FC", "mpiifort")
-- Platform name exported to the build as CMAKE_Platform.
setenv("CMAKE_Platform", "noaacloud.intel")

whatis("Description: UFS build environment")
352 changes: 176 additions & 176 deletions tests/RegressionTests_cheyenne.gnu.log

Large diffs are not rendered by default.

1,093 changes: 531 additions & 562 deletions tests/RegressionTests_cheyenne.intel.log

Large diffs are not rendered by default.

354 changes: 177 additions & 177 deletions tests/RegressionTests_hera.gnu.log

Large diffs are not rendered by default.

1,074 changes: 537 additions & 537 deletions tests/RegressionTests_hera.intel.log

Large diffs are not rendered by default.

1,066 changes: 539 additions & 527 deletions tests/RegressionTests_jet.intel.log

Large diffs are not rendered by default.

1,088 changes: 544 additions & 544 deletions tests/RegressionTests_orion.intel.log

Large diffs are not rendered by default.

888 changes: 444 additions & 444 deletions tests/RegressionTests_wcoss2.intel.log

Large diffs are not rendered by default.

31 changes: 30 additions & 1 deletion tests/default_vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,34 @@ elif [[ $MACHINE_ID = stampede.* ]]; then
TPN_cpl_atmw_gdas=12; INPES_cpl_atmw_gdas=6; JNPES_cpl_atmw_gdas=8
THRD_cpl_atmw_gdas=4; WPG_cpl_atmw_gdas=24; APB_cpl_atmw_gdas="0 311"; WPB_cpl_atmw_gdas="312 559"

elif [[ ${MACHINE_ID} = noaacloud.* ]] ; then

# Tasks per node are chosen by the cloud provider named in PW_CSP.
# TPN is left untouched when PW_CSP is none of the providers listed here.
case $PW_CSP in
  aws)    TPN=36 ;;
  azure)  TPN=44 ;;
  google) TPN=30 ;;
esac

# Default and threaded layouts.
INPES_dflt=3
JNPES_dflt=8
INPES_thrd=3
JNPES_thrd=4

# c384 layout.
INPES_c384=8
JNPES_c384=6
THRD_c384=2

# c768 layout.
INPES_c768=8
JNPES_c768=16
THRD_c768=2

# Coupled configuration, default threading.
THRD_cpl_dflt=1
INPES_cpl_dflt=3
JNPES_cpl_dflt=8
WPG_cpl_dflt=6
OCN_tasks_cpl_dflt=20
ICE_tasks_cpl_dflt=10
WAV_tasks_cpl_dflt=20

# Coupled configuration, threaded.
THRD_cpl_thrd=2
INPES_cpl_thrd=3
JNPES_cpl_thrd=4
WPG_cpl_thrd=6
OCN_tasks_cpl_thrd=20
ICE_tasks_cpl_thrd=10
WAV_tasks_cpl_thrd=12

elif [[ $MACHINE_ID = expanse.* ]]; then

echo "Unknown MACHINE_ID ${MACHINE_ID}. Please update tasks configurations in default_vars.sh"
Expand All @@ -269,7 +297,8 @@ elif [[ $MACHINE_ID = expanse.* ]]; then

TPN_cpl_atmw_gdas=12; INPES_cpl_atmw_gdas=6; JNPES_cpl_atmw_gdas=8
THRD_cpl_atmw_gdas=2; WPG_cpl_atmw_gdas=24; APB_cpl_atmw_gdas="0 311"; WPB_cpl_atmw_gdas="312 559"



else

echo "Unknown MACHINE_ID ${MACHINE_ID}"
Expand Down
13 changes: 12 additions & 1 deletion tests/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,21 @@ case $(hostname -f) in
login2.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede2
login3.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede3
login4.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede4



login01.expanse.sdsc.edu) MACHINE_ID=expanse ;; ### expanse1
login02.expanse.sdsc.edu) MACHINE_ID=expanse ;; ### expanse2

esac

# Cloud detection. PW_CSP names the cloud provider (aws, google or azure);
# presumably it is exported by the Parallel Works environment -- confirm.
# Every provider maps to the single machine ID "noaacloud".
#
# PW_CSP is expanded directly, quoted and with an empty default, so an unset
# variable is harmless under `set -u` and no subshell, word splitting or
# globbing is involved.
case "${PW_CSP:-}" in
  aws | google | azure) MACHINE_ID=noaacloud ;; ### parallelworks aws / gcp / azure
esac

# Backward compatibility: a MACHINE_ID that already contains one of the
# provider aliases (aws, gcp, azure) is still promoted to noaacloud.
# The empty default keeps this safe under `set -u` when nothing was detected.
case "${MACHINE_ID:-}" in
  *aws* | *gcp* | *azure*) MACHINE_ID=noaacloud ;;
esac

# Overwrite auto-detect with RT_MACHINE if set
MACHINE_ID=${RT_MACHINE:-${MACHINE_ID}}
Expand Down
17 changes: 17 additions & 0 deletions tests/fv3_conf/compile_slurm.IN_noaacloud
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh
#SBATCH -e err
#SBATCH -o out
#SBATCH --qos=batch
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=@[TPN]
#SBATCH --job-name="@[JBNME]"

# Slurm job card that runs one compile job.
# Tokens of the form @[NAME] are template placeholders; presumably they are
# substituted by the regression-test scripts before submission -- confirm.

set -eux

# Start job_timestamp.txt with the start time as " <epoch seconds>,".
# printf is used because `echo -n` is not portable under /bin/sh.
printf ' %s,' "$( date +%s )" > job_timestamp.txt
# The date is left unquoted on purpose so the printed text keeps the original
# word-split form.
echo "Compile started: " $( date )

@[PATHRT]/compile.sh @[MACHINE_ID] "@[MAKE_OPT]" @[COMPILE_NR]

echo "Compile ended: " $( date )
# Append the end time in the same " <epoch seconds>," form.
printf ' %s,' "$( date +%s )" >> job_timestamp.txt
45 changes: 45 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_noaacloud
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/sh
#SBATCH -e err
#SBATCH -o out
#SBATCH --qos=batch
### #SBATCH --ntasks=@[TASKS]
#SBATCH --nodes=@[NODES]
#SBATCH --ntasks-per-node=@[TPN]
#SBATCH --job-name="@[JBNME]"
#SBATCH --exclusive

# Slurm job card that runs ./fv3.exe from the submission directory.
# Tokens of the form @[NAME] are template placeholders; presumably they are
# substituted by the regression-test scripts before submission -- confirm.

set -eux
# Start job_timestamp.txt with the start time as " <epoch seconds>,".
# printf is used because `echo -n` is not portable under /bin/sh.
printf ' %s,' "$( date +%s )" > job_timestamp.txt

# Command tracing is switched off while the module environment is set up.
set +x
MACHINE_ID=noaacloud
# Quoted so a working directory containing whitespace stays one argument.
module use "$( pwd -P )"
module use /contrib/EPIC/spack-stack/spack-stack-1.3.0/envs/unified-dev/install/modulefiles/Core
module load stack-intel/2021.3.0 stack-intel-oneapi-mpi/2021.3.0
module load ufs-weather-model-env/unified-dev
module list

set -x

# Lift the stack-size and locked-memory limits for the model run.
ulimit -s unlimited
ulimit -l unlimited

# The date is left unquoted on purpose so the printed text keeps the original
# word-split form.
echo "Model started: " $( date )

#export MPI_TYPE_DEPTH=20
export OMP_STACKSIZE=512M
export KMP_AFFINITY=scatter
# NOTE(review): the thread count is fixed at 1 here, independent of any
# per-test thread setting -- confirm this is intended.
export OMP_NUM_THREADS=1
#export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
#export PSM_RANKS_PER_CONTEXT=4
#export PSM_SHAREDCONTEXTS=1
#export ESMF_RUNTIME_PROFILE=ON
#export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"

# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

srun --mpi=pmi2 --label -n @[TASKS] ./fv3.exe

echo "Model ended: " $( date )
# Append the end time in the same " <epoch seconds>," form.
printf ' %s,' "$( date +%s )" >> job_timestamp.txt
9 changes: 8 additions & 1 deletion tests/module-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,14 @@ elif [[ $MACHINE_ID = cheyenne* ]] ; then
source /glade/u/apps/ch/modulefiles/default/localinit/localinit.sh
fi
module purge


# Branch for any MACHINE_ID that starts with "noaacloud" (the pattern has no
# dot, so both "noaacloud" and "noaacloud.<suffix>" match).
elif [[ $MACHINE_ID = noaacloud* ]] ; then
# We are on NOAA Cloud
# Probe in a subshell whether the `module` command already works, discarding
# its output; only when the probe fails, initialise Lmod for bash from the
# 8.5.2 installation under /apps.
if ( ! eval module help > /dev/null 2>&1 ) ; then
source /apps/lmod/8.5.2/init/bash
fi
# Unload every currently loaded module so later loads start from a clean state.
module purge

elif [[ $MACHINE_ID = stampede* ]] ; then
# We are on TACC Stampede
if ( ! eval module help > /dev/null 2>&1 ) ; then
Expand Down
26 changes: 26 additions & 0 deletions tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,29 @@ elif [[ $MACHINE_ID = expanse.* ]]; then
PTMP=$dprefix
SCHEDULER=slurm
cp fv3_conf/fv3_slurm.IN_expanse fv3_conf/fv3_slurm.IN

elif [[ $MACHINE_ID = noaacloud.* ]]; then

# Rocoto: make the module visible, load it, then record where its tools live.
module use /apps/modules/modulefiles
module load rocoto/1.3.3
ROCOTORUN="$(which rocotorun)"
ROCOTOSTAT="$(which rocotostat)"
ROCOTOCOMPLETE="$(which rocotocomplete)"

# Batch system: Slurm for direct and Rocoto-driven submission, a single
# "batch" queue for run and compile jobs, and an empty partition.
SCHEDULER="slurm"
ROCOTO_SCHEDULER="slurm"
QUEUE="batch"
COMPILE_QUEUE="batch"
PARTITION=""

# Disk layout. dprefix keeps its trailing slash, so STMP and PTMP contain a
# double slash exactly as before.
dprefix="/lustre/"
DISKNM="/contrib/ufs-weather-model/RT"
STMP="${dprefix}/stmp4"
PTMP="${dprefix}/stmp2"

# Install the noaacloud job-card templates under the generic names.
cp fv3_conf/fv3_slurm.IN_noaacloud fv3_conf/fv3_slurm.IN
cp fv3_conf/compile_slurm.IN_noaacloud fv3_conf/compile_slurm.IN


else
die "Unknown machine ID, please edit detect_machine.sh file"
fi
Expand Down Expand Up @@ -510,6 +532,10 @@ if [[ $ROCOTO == true ]]; then
QUEUE=s4
COMPILE_QUEUE=s4
ROCOTO_SCHEDULER=slurm
elif [[ $MACHINE_ID = noaacloud.* ]]; then
# Rocoto on noaacloud: Slurm scheduler, "batch" queue for compile and run jobs.
ROCOTO_SCHEDULER="slurm"
COMPILE_QUEUE="batch"
QUEUE="batch"
elif [[ $MACHINE_ID = jet.* ]]; then
QUEUE=batch
COMPILE_QUEUE=batch
Expand Down
4 changes: 2 additions & 2 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -355,11 +355,11 @@ check_results() {
fi

if [[ $d -eq 1 && ${i##*.} == 'nc' ]] ; then
if [[ ${MACHINE_ID} =~ orion || ${MACHINE_ID} =~ hera || ${MACHINE_ID} =~ wcoss2 || ${MACHINE_ID} =~ acorn || ${MACHINE_ID} =~ cheyenne || ${MACHINE_ID} =~ gaea || ${MACHINE_ID} =~ jet || ${MACHINE_ID} =~ s4 ]] ; then
if [[ ${MACHINE_ID} =~ orion || ${MACHINE_ID} =~ hera || ${MACHINE_ID} =~ wcoss2 || ${MACHINE_ID} =~ acorn || ${MACHINE_ID} =~ cheyenne || ${MACHINE_ID} =~ gaea || ${MACHINE_ID} =~ jet || ${MACHINE_ID} =~ s4 || ${MACHINE_ID} =~ noaacloud ]] ; then
printf ".......ALT CHECK.." >> ${REGRESSIONTEST_LOG}
printf ".......ALT CHECK.."
if [[ ${MACHINE_ID} =~ orion || ${MACHINE_ID} =~ hera || ${MACHINE_ID} =~ gaea || ${MACHINE_ID} =~ jet || ${MACHINE_ID} =~ cheyenne ]] ; then
nccmp -d -f -g --Attribute=checksum --warn=format ${RTPWD}/${CNTL_DIR}/${i} ${RUNDIR}/${i} > ${i}_nccmp.log 2>&1 && d=$? || d=$?
nccmp -d -f -g -B --Attribute=checksum --warn=format ${RTPWD}/${CNTL_DIR}/${i} ${RUNDIR}/${i} > ${i}_nccmp.log 2>&1 && d=$? || d=$?
else
${PATHRT}/compare_ncfile.py ${RTPWD}/${CNTL_DIR}/$i ${RUNDIR}/$i > compare_ncfile.log 2>&1 && d=$? || d=$?
fi
Expand Down

0 comments on commit ee35891

Please sign in to comment.