diff --git a/scripts/config/HPL_128GPU.dat b/scripts/config/HPL_128GPU.dat index df751ce..61cf18f 100644 --- a/scripts/config/HPL_128GPU.dat +++ b/scripts/config/HPL_128GPU.dat @@ -8,8 +8,8 @@ HPL.out output file name (if any) 384 NBs 0 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) -8 Ps -16 Qs +16 Ps +8 Qs 16.0 threshold 1 # of panel fact 2 PFACTs (0=left, 1=Crout, 2=Right) diff --git a/scripts/config/HPL_2048GPU.dat b/scripts/config/HPL_2048GPU.dat new file mode 100644 index 0000000..0c38025 --- /dev/null +++ b/scripts/config/HPL_2048GPU.dat @@ -0,0 +1,31 @@ +HPLinpack benchmark input file +Innovative Computing Laboratory, University of Tennessee +HPL.out output file name (if any) +0 device out (6=stdout,7=stderr,file) +1 # of problems sizes (N) +4104192 N +1 # of NBs +384 NBs +0 PMAP process mapping (0=Row-,1=Column-major) +1 # of process grids (P x Q) +64 Ps +32 Qs +16.0 threshold +1 # of panel fact +2 PFACTs (0=left, 1=Crout, 2=Right) +1 # of recursive stopping criterium +2 NBMINs (>= 1) +1 # of panels in recursion +2 NDIVs +1 # of recursive panel fact. +2 RFACTs (0=left, 1=Crout, 2=Right) +1 # of broadcast +6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast) +1 # of lookahead depth +1 DEPTHs (>=0) +1 SWAP (0=bin-exch,1=long,2=mix) +64 swapping threshold +1 L1 in (0=transposed,1=no-transposed) form +0 U in (0=transposed,1=no-transposed) form +0 Equilibration (0=no,1=yes) +8 memory alignment in double (> 0) diff --git a/scripts/config/HPL_32GPU.dat b/scripts/config/HPL_32GPU.dat index 4c13519..cf14244 100644 --- a/scripts/config/HPL_32GPU.dat +++ b/scripts/config/HPL_32GPU.dat @@ -8,8 +8,8 @@ HPL.out output file name (if any) 384 NBs 0 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) -4 Ps -8 Qs +8 Ps +4 Qs 16.0 threshold 1 # of panel fact 2 PFACTs (0=left, 1=Crout, 2=Right) diff --git a/scripts/config/HPL_4096GPU.dat b/scripts/config/HPL_4096GPU.dat new file mode 100644 index 0000000..1415dda --- /dev/null +++ b/scripts/config/HPL_4096GPU.dat @@ -0,0 +1,31 @@ +HPLinpack benchmark input file +Innovative Computing Laboratory, University of Tennessee +HPL.out output file name (if any) +0 device out (6=stdout,7=stderr,file) +1 # of problems sizes (N) +5787648 N +1 # of NBs +384 NBs +0 PMAP process mapping (0=Row-,1=Column-major) +1 # of process grids (P x Q) +64 Ps +64 Qs +16.0 threshold +1 # of panel fact +2 PFACTs (0=left, 1=Crout, 2=Right) +1 # of recursive stopping criterium +2 NBMINs (>= 1) +1 # of panels in recursion +2 NDIVs +1 # of recursive panel fact. +2 RFACTs (0=left, 1=Crout, 2=Right) +1 # of broadcast +6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast) +1 # of lookahead depth +1 DEPTHs (>=0) +1 SWAP (0=bin-exch,1=long,2=mix) +64 swapping threshold +1 L1 in (0=transposed,1=no-transposed) form +0 U in (0=transposed,1=no-transposed) form +0 Equilibration (0=no,1=yes) +8 memory alignment in double (> 0) diff --git a/scripts/config/HPL_512GPU.dat b/scripts/config/HPL_512GPU.dat index 5a6e2e9..a85561c 100644 --- a/scripts/config/HPL_512GPU.dat +++ b/scripts/config/HPL_512GPU.dat @@ -8,8 +8,8 @@ HPL.out output file name (if any) 384 NBs 0 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) -16 Ps -32 Qs +32 Ps +16 Qs 16.0 threshold 1 # of panel fact 2 PFACTs (0=left, 1=Crout, 2=Right) diff --git a/scripts/config/HPL_8GPU.dat b/scripts/config/HPL_8GPU.dat index 531e050..6397f1b 100644 --- a/scripts/config/HPL_8GPU.dat +++ b/scripts/config/HPL_8GPU.dat @@ -3,13 +3,13 @@ Innovative Computing Laboratory, University of Tennessee HPL.out output file name (if any) 0 device out (6=stdout,7=stderr,file) 1 # of problems sizes (N) -256000 N +256512 N 1 # of NBs 384 NBs 0 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) -2 Ps -4 Qs +4 Ps +2 Qs 16.0 threshold 1 # of panel fact 2 PFACTs (0=left, 1=Crout, 2=Right) diff --git a/scripts/env.lumi.sh b/scripts/env.lumi.sh index 55085e9..1c4fcee 100644 --- a/scripts/env.lumi.sh +++ b/scripts/env.lumi.sh @@ -17,5 +17,5 @@ module load rocm/5.3.0-10584 # enable GPU aware MPI export MPICH_GPU_SUPPORT_ENABLED=1 # to work around the OFI registration cache issue for > 8 nodes -export FI_MR_CACHE_MAX_COUNT=0 +#export FI_MR_CACHE_MAX_COUNT=0 export MPICH_RANK_REORDER_DISPLAY=1 diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl.slurm index 9d69cb7..381779b 100755 --- a/scripts/run_hpl.slurm +++ b/scripts/run_hpl.slurm @@ -3,8 +3,9 @@ #SBATCH -N 1 ##SBATCH -n 8 #SBATCH -c 8 -#SBATCH -t 1:00:00 -#SBATCH -A VEN114 +#SBATCH -t 2:00:00 +##SBATCH -A VEN114 +#SBATCH -A project_462000075 #SBATCH -J xhplhip #SBATCH --gpu-bind=closest #SBATCH --ntasks-per-node=8 @@ -35,6 +36,18 @@ CMD+="-o $LOG -e $LOG " #CMD+="${HOME}/mpich_bind.sh " CMD+="$EXE" +if [ $NODES -gt 8 ]; then + echo "export FI_MR_CACHE_MAX_COUNT=0" + export FI_MR_CACHE_MAX_COUNT=0 +else + echo "unset FI_MR_CACHE_MAX_COUNT" + unset FI_MR_CACHE_MAX_COUNT +fi + +#export MPICH_SMP_SINGLE_COPY_MODE=NONE # does not work +#export FI_MR_CACHE_MAX_COUNT=0 # workaround for failed to allocate memory +#export MPICH_RANK_REORDER_DISPLAY=1 + echo $CMD >> $LOG echo $CMD 2>&1 | tee -a $LOG $CMD 2>&1 | tee -a $LOG