-
Notifications
You must be signed in to change notification settings - Fork 170
/
Copy pathintegrationTestExecute.sh
executable file
·362 lines (293 loc) · 11.9 KB
/
integrationTestExecute.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
#!/bin/bash
#SBATCH -n 24
#SBATCH -N 1
#SBATCH -c 1
#SBATCH -t 0-5:00
#SBATCH -p REQUESTED_PARTITION
#SBATCH --mem=150000
#SBATCH --mail-type=END
#BSUB -q REQUESTED_PARTITION
#BSUB -n 24
#BSUB -W 5:00
#BSUB -R "rusage[mem=90GB] span[ptile=1] select[mem < 2TB]"
#BSUB -a 'docker(registry.gsc.wustl.edu/sleong/esm:intel-2021.1.2)'
#BSUB -o lsf-%J.txt
#------------------------------------------------------------------------------
# GEOS-Chem Global Chemical Transport Model !
#------------------------------------------------------------------------------
#BOP
#
# !MODULE: integrationTestExecute.sh
#
# !DESCRIPTION: Runs execution tests on various GEOS-Chem Classic
# run directories (interactively or using a scheduler)
#\\
#\\
# !CALLING SEQUENCE:
# ./integrationTestExecute.sh # Interactive command-line execution
# bsub integrationTestExecute.sh # Execution via LSF
# sbatch integrationTestExecute.sh # Execution via SLURM
#EOP
#------------------------------------------------------------------------------
#BOC
#============================================================================
# Global variable and function definitions
#============================================================================

# This script starts executing 1 level lower than $itRoot
itRoot=$(cd ..; pwd)

# Include global variables & functions
# (defines BIN_DIR, ENV_DIR, LOGS_DIR, RUNDIRS_DIR, SEP_MAJOR, SEP_MINOR,
#  FILL, EXE_PASS_STR, EXE_FAIL_STR, and the helper functions used below)
. "${itRoot}/scripts/commonFunctionsForTests.sh"

# Create local convenience variables
binDir="${itRoot}/${BIN_DIR}"
envDir="${itRoot}/${ENV_DIR}"
codeDir="${itRoot}/CodeDir"
logsDir="${itRoot}/${LOGS_DIR}"
rundirsDir="${itRoot}/${RUNDIRS_DIR}"
site=$(get_site_name)

# Load the environment and the software environment
. ~/.bashrc > /dev/null 2>&1
[[ "X${site}" == "XCANNON" ]] && . ${envDir}/gchp.env > /dev/null 2>&1

# Site-specific settings
if [[ "X${site}" == "XCANNON" && "X${SLURM_JOBID}" != "X" ]]; then

    #----------------------------------
    # SLURM settings (Harvard Cannon)
    #----------------------------------

    # Set OMP_NUM_THREADS to the same # of cores requested with #SBATCH -c
    export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

elif [[ "X${site}" == "XCOMPUTE1" && "X${LSB_JOBID}" != "X" ]]; then

    #---------------------------------
    # LSF settings (WashU Compute1)
    #---------------------------------

    # Set OMP_NUM_THREADS to the same # of cores requested with #BSUB -n
    export OMP_NUM_THREADS=${LSB_DJOB_NUMPROC}

    # Unlimit resources to prevent OS killing GCHP due to resource usage.
    # Alternatively you can put this in your environment file.
    ulimit -c 0                  # coredumpsize
    ulimit -l unlimited          # memorylocked
    ulimit -u 50000              # maxproc
    ulimit -v unlimited          # vmemoryuse
    ulimit -s unlimited          # stacksize

else

    #---------------------------------
    # Interactive settings
    #---------------------------------
    echo ""
    echo "Execution tests running..."

    # For AWS, set $OMP_NUM_THREADS to the available cores.
    # BUGFIX: "aws" only appears as a substring of the kernel string
    # (e.g. "5.10.x-...-aws"), so use a regex match instead of the
    # exact string equality test, which could never succeed.
    kernel=$(uname -r)
    [[ "${kernel}" =~ aws ]] && export OMP_NUM_THREADS=$(nproc)

fi

# Sanity check: Set OMP_NUM_THREADS to 6 if it is not set
# (this may happen when running interactively)
[[ "x${OMP_NUM_THREADS}" == "x" ]] && export OMP_NUM_THREADS=6

# Sanity check: Max out the OMP_STACKSIZE if it is not set
[[ "x${OMP_STACKSIZE}" == "x" ]] && export OMP_STACKSIZE=500m

# Count the number of tests to be run (same as the # of run directories)
numTests=$(count_rundirs "${rundirsDir}")
#============================================================================
# Initialize results logfile
#============================================================================

# Results logfile name
results="${logsDir}/results.execute.log"
rm -f "${results}"

# Print header to results log file
print_to_log "${SEP_MAJOR}" "${results}"
print_to_log "GCHP: Execution Test Results" "${results}"
print_to_log "" "${results}"
print_submodule_head_commits "14" "${codeDir}" "${results}"
print_to_log "" "${results}"
print_to_log "Number of execution tests: ${numTests}" "${results}"
print_to_log "" "${results}"

# Note how this job was submitted.
# BUGFIX: detect an LSF job by testing that LSB_JOBID is non-empty;
# the original compared the job ID against the string "COMPUTE1",
# which can never match, so LSF jobs were reported as interactive.
if [[ "X${SLURM_JOBID}" != "X" ]]; then
    print_to_log "Submitted as SLURM job: ${SLURM_JOBID}" "${results}"
elif [[ "X${LSB_JOBID}" != "X" ]]; then
    print_to_log "Submitted as LSF job: ${LSB_JOBID}" "${results}"
else
    print_to_log "Submitted as interactive job" "${results}"
fi
print_to_log "${SEP_MAJOR}" "${results}"
#============================================================================
# Run the GEOS-Chem executable in each GEOS-Chem run directory
#============================================================================
print_to_log " " "${results}"
print_to_log "Execution tests:" "${results}"
print_to_log "${SEP_MINOR}" "${results}"

# Keep track of the number of tests that passed & failed
let passed=0
let failed=0
let remain=${numTests}

# Navigate to the directory containing individual run directories
cd "${rundirsDir}"

# Loop over rundirs and run GEOS-Chem
for runDir in *; do

    # Expand rundir to absolute path
    runAbsPath="${rundirsDir}/${runDir}"

    # Do the following only for valid GCHP run dirs
    expr=$(is_gchp_rundir "${runAbsPath}")
    if [[ "x${expr}" == "xTRUE" ]]; then

        # Define log file
        log="${logsDir}/execute.${runDir}.log"
        rm -f "${log}"

        # Messages for execution pass & fail
        passMsg="$runDir${FILL:${#runDir}}.....${EXE_PASS_STR}"
        failMsg="$runDir${FILL:${#runDir}}.....${EXE_FAIL_STR}"

        # Get the executable file corresponding to this run directory
        exeFile=$(exe_name "gchp" "${runDir}")

        # Test if the executable exists
        if [[ -f "${binDir}/${exeFile}" ]]; then

            #----------------------------------------------------------------
            # If the executable file exists, we can do the test
            #----------------------------------------------------------------

            # Change to the run directory
            cd "${runAbsPath}"

            # Copy the executable file here
            cp "${binDir}/${exeFile}" .

            # Update to make sure the run directory is executable
            # on Compute1.  We will later replace this test with
            # a test on the site name instead of on the scheduler.
            # TODO: Test on name rather than scheduler
            if [[ "X${site}" == "XCOMPUTE1" ]]; then
                chmod 755 -R "${runAbsPath}"
            fi

            # Remove any leftover files in the run dir
            ./cleanRunDir.sh --no-interactive >> "${log}" 2>&1

            # Also reset cap_restart to 00:00:00 UTC,
            # in case we are restarting the tests manually
            sed_ie 's/ ....00/ 000000/g' cap_restart

            # Link to the environment file
            ./setEnvironmentLink.sh "${envDir}/gchp.env"

            # Update config files, set links, load environment, sanity checks
            . setCommonRunSettings.sh >> "${log}" 2>&1
            . setRestartLink.sh       >> "${log}" 2>&1
            . gchp.env                >> "${log}" 2>&1
            . checkRunSettings.sh     >> "${log}" 2>&1

            # For safety's sake, remove restarts that weren't renamed.
            # BUGFIX: glob the files inside Restarts/; the original
            # iterated over the literal word "Restarts", so no leftover
            # checkpoint file was ever matched or removed.
            for rst in Restarts/*; do
                if [[ "${rst}" =~ "gcchem_internal_checkpoint" ]]; then
                    rm -f "${rst}"
                fi
            done

            # Run GCHP and evenly distribute tasks across nodes
            if [[ "X${site}" == "XCANNON" && "X${SLURM_JOBID}" != "X" ]]; then

                #---------------------------------------------
                # Executing GCHP on SLURM (Harvard Cannon)
                #---------------------------------------------

                # Compute parameters for srun
                # See the gchp.run script in the folder:
                #  runScriptSamples/operational_examples/harvard_cannon
                NX=$(grep NX GCHP.rc | awk '{print $2}')
                NY=$(grep NY GCHP.rc | awk '{print $2}')
                coreCt=$(( NX * NY ))
                planeCt=$(( coreCt / SLURM_NNODES ))

                # Round planeCt up if the cores do not divide evenly
                # across the nodes.  (Use an arithmetic test; the
                # original used ">" inside [[ ]], which compares
                # lexicographically rather than numerically.)
                if (( coreCt % SLURM_NNODES > 0 )); then
                    planeCt=$(( planeCt + 1 ))
                fi

                # Execute GCHP with srun
                srun -n ${coreCt} -N ${SLURM_NNODES} -m plane=${planeCt} \
                     --mpi=pmix ./${exeFile} >> "${log}" 2>&1

            elif [[ "X${site}" == "XCOMPUTE1" && "X${LSB_JOBID}" != "X" ]]; then

                #---------------------------------------------
                # Executing GCHP on LSF (WashU Compute1)
                #
                # BUGFIX: test the site name and LSF job ID as the
                # sibling branches do; the original tested the
                # never-assigned ${scheduler} variable, so this
                # branch could never be taken.  Also append (>>) to
                # the log for consistency with the other branches.
                #---------------------------------------------
                mpiexec -n 24 ./${exeFile} >> "${log}" 2>&1

            else

                #---------------------------------------------
                # Executing GCHP interactively
                #---------------------------------------------
                mpirun -n 24 ./${exeFile} >> "${log}" 2>&1

            fi

            # Update pass/failed counts and write to results.log.
            # ($? here is the exit status of the srun/mpiexec/mpirun
            # command executed in whichever branch was taken above.)
            if [[ $? -eq 0 ]]; then

                # The run passed ...
                let passed++
                print_to_log "${passMsg}" "${results}"

                # ... so also rename the end-of-run restart file
                new_start_str=$(sed 's/ /_/g' cap_restart)
                N=$(grep "CS_RES=" setCommonRunSettings.sh | cut -c 8- | xargs )
                mv Restarts/gcchem_internal_checkpoint \
                   Restarts/GEOSChem.Restart.${new_start_str:0:13}z.c${N}.nc4

            else

                # The run failed
                let failed++
                print_to_log "${failMsg}" "${results}"

            fi

            # Change to root directory for next iteration
            cd "${rundirsDir}"

        else

            #----------------------------------------------------------------
            # If the executable is missing, update the "fail" counter
            # and write the "failed" message to the results log file.
            #----------------------------------------------------------------
            let failed++
            print_to_log "${failMsg}" "${results}"

        fi

        # Decrement the count of remaining tests
        let remain--

    fi
done
#============================================================================
# Check the number of simulations that have passed
#============================================================================

# Print summary to log.
# (Quote "${results}" consistently, as is done elsewhere in this script.)
print_to_log " "                                           "${results}"
print_to_log "Summary of test results:"                    "${results}"
print_to_log "${SEP_MINOR}"                                "${results}"
print_to_log "Execution tests passed: ${passed}"           "${results}"
print_to_log "Execution tests failed: ${failed}"           "${results}"
print_to_log "Execution tests not yet completed: ${remain}" "${results}"

# Check if all tests passed
if [[ "X${passed}" == "X${numTests}" ]]; then

    #--------------------------
    # Successful execution
    #--------------------------
    print_to_log ""                                        "${results}"
    print_to_log "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"   "${results}"
    print_to_log "%%%  All execution tests passed!  %%%"   "${results}"
    print_to_log "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"   "${results}"

    # Print success (if interactive)
    if [[ "X${SLURM_JOBID}" == "X" && "X${LSB_JOBID}" == "X" ]]; then
        echo ""
        echo "Execution tests finished!"
    fi

else

    #--------------------------
    # Unsuccessful execution
    #--------------------------

    # Print failure message (if interactive)
    if [[ "X${SLURM_JOBID}" == "X" && "X${LSB_JOBID}" == "X" ]]; then
        echo ""
        echo "Execution tests failed!  Exiting ..."
    fi

fi
#============================================================================
# Cleanup and quit
#============================================================================

# Free local variables.
# BUGFIX: the original unset the misspelled names "absRunPath" and
# "failmsg"; the variables actually set above are "runAbsPath" and
# "failMsg".  Also unset a few variables this script sets but did not
# previously free (site, expr, kernel, new_start_str, N, runDir).
unset binDir
unset codeDir
unset coreCt
unset envDir
unset exeFile
unset expr
unset failed
unset failMsg
unset head_gchp
unset head_gc
unset head_hco
unset itRoot
unset kernel
unset log
unset logsDir
unset N
unset new_start_str
unset numTests
unset NX
unset NY
unset passed
unset passMsg
unset planeCt
unset remain
unset results
unset runAbsPath
unset runDir
unset rundirsDir
unset scheduler
unset site

# Free imported global variables
unset FILL
unset LINE
unset CMP_PASS_STR
unset CMP_FAIL_STR
unset EXE_PASS_STR
unset EXE_FAIL_STR
#EOC