diff --git a/scripts/config/HPL_128GPU.dat b/scripts/config/HPL_128GPU.dat
index df751ce..61cf18f 100644
--- a/scripts/config/HPL_128GPU.dat
+++ b/scripts/config/HPL_128GPU.dat
@@ -8,8 +8,8 @@ HPL.out      output file name (if any)
 384          NBs
 0            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
-8            Ps
-16           Qs
+16           Ps
+8            Qs
 16.0         threshold
 1            # of panel fact
 2            PFACTs (0=left, 1=Crout, 2=Right)
diff --git a/scripts/config/HPL_2048GPU.dat b/scripts/config/HPL_2048GPU.dat
new file mode 100644
index 0000000..0c38025
--- /dev/null
+++ b/scripts/config/HPL_2048GPU.dat
@@ -0,0 +1,31 @@
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out      output file name (if any)
+0            device out (6=stdout,7=stderr,file)
+1            # of problems sizes (N)
+4104192      N
+1            # of NBs
+384          NBs
+0            PMAP process mapping (0=Row-,1=Column-major)
+1            # of process grids (P x Q)
+64           Ps
+32           Qs
+16.0         threshold
+1            # of panel fact
+2            PFACTs (0=left, 1=Crout, 2=Right)
+1            # of recursive stopping criterium
+2            NBMINs (>= 1)
+1            # of panels in recursion
+2            NDIVs
+1            # of recursive panel fact.
+2            RFACTs (0=left, 1=Crout, 2=Right)
+1            # of broadcast
+6            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast)
+1            # of lookahead depth
+1            DEPTHs (>=0)
+1            SWAP (0=bin-exch,1=long,2=mix)
+64           swapping threshold
+1            L1 in (0=transposed,1=no-transposed) form
+0            U  in (0=transposed,1=no-transposed) form
+0            Equilibration (0=no,1=yes)
+8            memory alignment in double (> 0)
diff --git a/scripts/config/HPL_32GPU.dat b/scripts/config/HPL_32GPU.dat
index 4c13519..cf14244 100644
--- a/scripts/config/HPL_32GPU.dat
+++ b/scripts/config/HPL_32GPU.dat
@@ -8,8 +8,8 @@ HPL.out      output file name (if any)
 384          NBs
 0            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
-4            Ps
-8            Qs
+8            Ps
+4            Qs
 16.0         threshold
 1            # of panel fact
 2            PFACTs (0=left, 1=Crout, 2=Right)
diff --git a/scripts/config/HPL_4096GPU.dat b/scripts/config/HPL_4096GPU.dat
new file mode 100644
index 0000000..1415dda
--- /dev/null
+++ b/scripts/config/HPL_4096GPU.dat
@@ -0,0 +1,31 @@
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out      output file name (if any)
+0            device out (6=stdout,7=stderr,file)
+1            # of problems sizes (N)
+5787648      N
+1            # of NBs
+384          NBs
+0            PMAP process mapping (0=Row-,1=Column-major)
+1            # of process grids (P x Q)
+64           Ps
+64           Qs
+16.0         threshold
+1            # of panel fact
+2            PFACTs (0=left, 1=Crout, 2=Right)
+1            # of recursive stopping criterium
+2            NBMINs (>= 1)
+1            # of panels in recursion
+2            NDIVs
+1            # of recursive panel fact.
+2            RFACTs (0=left, 1=Crout, 2=Right)
+1            # of broadcast
+6            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast)
+1            # of lookahead depth
+1            DEPTHs (>=0)
+1            SWAP (0=bin-exch,1=long,2=mix)
+64           swapping threshold
+1            L1 in (0=transposed,1=no-transposed) form
+0            U  in (0=transposed,1=no-transposed) form
+0            Equilibration (0=no,1=yes)
+8            memory alignment in double (> 0)
diff --git a/scripts/config/HPL_512GPU.dat b/scripts/config/HPL_512GPU.dat
index 5a6e2e9..a85561c 100644
--- a/scripts/config/HPL_512GPU.dat
+++ b/scripts/config/HPL_512GPU.dat
@@ -8,8 +8,8 @@ HPL.out      output file name (if any)
 384          NBs
 0            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
-16           Ps
-32           Qs
+32           Ps
+16           Qs
 16.0         threshold
 1            # of panel fact
 2            PFACTs (0=left, 1=Crout, 2=Right)
diff --git a/scripts/config/HPL_8GPU.dat b/scripts/config/HPL_8GPU.dat
index 531e050..6397f1b 100644
--- a/scripts/config/HPL_8GPU.dat
+++ b/scripts/config/HPL_8GPU.dat
@@ -3,13 +3,13 @@ Innovative Computing Laboratory, University of Tennessee
 HPL.out      output file name (if any)
 0            device out (6=stdout,7=stderr,file)
 1            # of problems sizes (N)
-256000       N
+256512       N
 1            # of NBs
 384          NBs
 0            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
-2            Ps
-4            Qs
+4            Ps
+2            Qs
 16.0         threshold
 1            # of panel fact
 2            PFACTs (0=left, 1=Crout, 2=Right)
diff --git a/scripts/env.lumi.sh b/scripts/env.lumi.sh
index 55085e9..1c4fcee 100644
--- a/scripts/env.lumi.sh
+++ b/scripts/env.lumi.sh
@@ -17,5 +17,5 @@ module load rocm/5.3.0-10584
 # enable GPU aware MPI
 export MPICH_GPU_SUPPORT_ENABLED=1
 # to work around the OFI registration cache issue for > 8 nodes
-export FI_MR_CACHE_MAX_COUNT=0
+#export FI_MR_CACHE_MAX_COUNT=0  # disabled here: run_hpl.slurm now exports it only when NODES > 8
 export MPICH_RANK_REORDER_DISPLAY=1
diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl.slurm
index 9d69cb7..381779b 100755
--- a/scripts/run_hpl.slurm
+++ b/scripts/run_hpl.slurm
@@ -3,8 +3,9 @@
 #SBATCH -N 1
 ##SBATCH -n 8
 #SBATCH -c 8
-#SBATCH -t 1:00:00
-#SBATCH -A VEN114
+#SBATCH -t 2:00:00
+##SBATCH -A VEN114
+#SBATCH -A project_462000075
 #SBATCH -J xhplhip
 #SBATCH --gpu-bind=closest
 #SBATCH --ntasks-per-node=8
@@ -35,6 +36,18 @@ CMD+="-o $LOG -e $LOG "
 #CMD+="${HOME}/mpich_bind.sh "
 CMD+="$EXE"
 
+if [ "${NODES:-0}" -gt 8 ]; then  # OFI registration cache workaround applies only above 8 nodes
+    echo "export FI_MR_CACHE_MAX_COUNT=0"
+          export FI_MR_CACHE_MAX_COUNT=0
+else  # 8 nodes or fewer (or NODES unset/empty): make sure the variable is not inherited
+    echo "unset FI_MR_CACHE_MAX_COUNT"
+          unset FI_MR_CACHE_MAX_COUNT
+fi
+
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE # does not work
+#export FI_MR_CACHE_MAX_COUNT=0 # workaround for "failed to allocate memory" errors; now set conditionally above
+#export MPICH_RANK_REORDER_DISPLAY=1
+
 echo $CMD >> $LOG
 echo $CMD 2>&1 | tee -a $LOG
      $CMD 2>&1 | tee -a $LOG