From b9115429a6ddac21acd4e5c01f642d922def7dd4 Mon Sep 17 00:00:00 2001
From: Redtorm <songxiang.zhu@foxmail.com>
Date: Fri, 9 Sep 2022 10:48:36 +0200
Subject: [PATCH 1/3] optimize the one-node performance

---
 CMakeLists.txt                         |    6 +-
 HPL.dat                                |    6 +-
 include/backend/hpl_backendHIP.h       |    6 +-
 include/backend/hpl_backendWrapper.h   |    2 +-
 include/hpl_grid.h                     |    6 +
 include/hpl_panel.h                    |    1 +
 include/hpl_pgesv.h                    |    1 +
 include/hpl_ptest.h                    |    4 +
 scripts/mpirun_xhplhip.sh              |    2 +-
 scripts/run_xhplhip.sh                 |  206 ++-
 src/grid/HPL_grid_init.cpp             |   20 +-
 src/pgesv/HPL_pdgesvK2_HIP.cpp         |    2 +-
 testing/backend/HPL_backendHIP.cpp     | 1731 ++++++++++++------------
 testing/backend/HPL_backendWrapper.cpp |    4 +-
 testing/ptest/HPL_pddriver.cpp         |   19 +-
 testing/ptest/HPL_pdinfo.cpp           |   21 +-
 testing/ptest/HPL_pdtest.cpp           |    2 +-
 17 files changed, 1112 insertions(+), 927 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 930e8f0..46097de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,7 +21,7 @@ find_package(OpenMP)
 
 # Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm
 list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
-include(/opt/rocm/hip/cmake/FindHIP.cmake)
+include(/opt/rocm/lib/cmake/hip/FindHIP.cmake)
 include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake)
 find_package( hip REQUIRED )
 find_package( rocblas REQUIRED )
@@ -212,12 +212,12 @@ hip_add_executable( xhplhip ${hpl_device_source} ${hpl_host_source})
 
 target_compile_options(xhplhip PRIVATE ${CMAKE_HOST_FLAGS})
 target_include_directories( xhplhip PUBLIC hip:device
-    ${HIP_ROOT_DIR}/include
+    # ${HIP_ROOT_DIR}/include
     ${HPLHIP_DEVRAND_INCLUDE_DIRS}
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>
 )
-# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
+# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
 target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX)
 
 configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF )
diff --git a/HPL.dat b/HPL.dat
index 71bd3c2..3f8575c 100755
--- a/HPL.dat
+++ b/HPL.dat
@@ -6,10 +6,10 @@ HPL.out      output file name (if any)
 256000        N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
-2            Ps
-4            Qs
+4            Ps
+2            Qs
 16.0         threshold
 1            # of panel fact
 2            PFACTs (0=left, 1=Crout, 2=Right)
diff --git a/include/backend/hpl_backendHIP.h b/include/backend/hpl_backendHIP.h
index 6c75ada..1df8ed2 100644
--- a/include/backend/hpl_backendHIP.h
+++ b/include/backend/hpl_backendHIP.h
@@ -2,9 +2,9 @@
 
 #include <hip/hip_runtime.h>
 #if defined(HPLHIP_USE_ROCRAND)
-#include <rocrand.h>
+#include <rocrand/rocrand.h>
 #endif
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 
 #include <cstdio>
 #include <cstdlib>
@@ -81,7 +81,7 @@ enum SWP_PHASE {
 };
 
 namespace HIP {
-    void init(size_t);
+    void init(const HPL_T_grid*);
     void release();
 
     void malloc(void**, size_t);
diff --git a/include/backend/hpl_backendWrapper.h b/include/backend/hpl_backendWrapper.h
index 785a793..206aeac 100644
--- a/include/backend/hpl_backendWrapper.h
+++ b/include/backend/hpl_backendWrapper.h
@@ -26,7 +26,7 @@ UPDATE_LOOK_AHEAD, L1TRANSFER, L2TRANSFER, DGEMMSTART, DGEMMSTOP, UPDATE, SWAPST
 enum HPL_STREAM {HPL_COMPUTESTREAM, HPL_DATASTREAM, HPL_PDLASWPSTREAM};
 
 
-void HPL_BE_init(size_t, enum HPL_TARGET);
+void HPL_BE_init(const HPL_T_grid*, enum HPL_TARGET);
 
 void HPL_BE_malloc(void**, size_t, enum HPL_TARGET);
 
diff --git a/include/hpl_grid.h b/include/hpl_grid.h
index 1895a5e..3c1397f 100644
--- a/include/hpl_grid.h
+++ b/include/hpl_grid.h
@@ -84,6 +84,10 @@ typedef struct HPL_S_grid
    int             col_hdim;     /* col_ip2 procs hypercube dimension */
    int             col_ip2m1;      /* largest power of two <= npcol-1 */
    int             col_mask;        /* col_ip2m1 procs hypercube mask */
+   int             local_myrow;
+   int             local_mycol; 
+   int             local_nprow; 
+   int             local_npcol; 
 } HPL_T_grid;
 
 /*
@@ -132,6 +136,8 @@ STDC_ARGS( (
    const HPL_T_ORDER,
    const int,
    const int,
+   const int,
+   const int,
    HPL_T_grid *
 ) );
 int                              HPL_grid_exit
diff --git a/include/hpl_panel.h b/include/hpl_panel.h
index fffb496..b3c12e5 100644
--- a/include/hpl_panel.h
+++ b/include/hpl_panel.h
@@ -125,6 +125,7 @@ typedef struct HPL_S_panel
    int                dlda;
    double*            U2;          /* ptr to U2 */
    double*            dU2;
+   int                dldl1;
    int                dldl2;
    int                ldu1;
    int                ldu2;
diff --git a/include/hpl_pgesv.h b/include/hpl_pgesv.h
index 2acf3cc..6d251eb 100644
--- a/include/hpl_pgesv.h
+++ b/include/hpl_pgesv.h
@@ -121,6 +121,7 @@ typedef struct HPL_S_pmat
    double              * d_X;             /* device pointer to solution vector */
    double              * W;
    double              * dW;
+   int                 dN;
 } HPL_T_pmat;
 /*
  * ---------------------------------------------------------------------
diff --git a/include/hpl_ptest.h b/include/hpl_ptest.h
index 5777bd5..1f13e5d 100644
--- a/include/hpl_ptest.h
+++ b/include/hpl_ptest.h
@@ -108,6 +108,8 @@ typedef struct HPL_S_test
  */
 void                             HPL_pdinfo
 STDC_ARGS( (
+   int,
+   char**,
    HPL_T_test *,
    int *,
    int *,
@@ -118,6 +120,8 @@ STDC_ARGS( (
    int *,
    int *,
    int *,
+   int *,
+   int *,
    HPL_T_FACT *,
    int *,
    int *,
diff --git a/scripts/mpirun_xhplhip.sh b/scripts/mpirun_xhplhip.sh
index e096823..617143a 100644
--- a/scripts/mpirun_xhplhip.sh
+++ b/scripts/mpirun_xhplhip.sh
@@ -20,7 +20,7 @@ total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
 #Default MPI options
-mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx --report-bindings -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
+mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
 
 ${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} 
 # ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} 
diff --git a/scripts/run_xhplhip.sh b/scripts/run_xhplhip.sh
index 777801f..ea4d314 100755
--- a/scripts/run_xhplhip.sh
+++ b/scripts/run_xhplhip.sh
@@ -2,11 +2,12 @@
 # set -x #echo on
 
 hpl_bin=./xhplhip
-rocblas_dir=/opt/rocm/rocblas/lib
-blas_dir=$HOME/OpenBLAS
+rocblas_dir=/opt/rocm-5.2.0/lib
+blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib
 
 filename=./HPL.dat
-
+p=-1
+q=-1
 export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$LD_LIBRARY_PATH
 
 oversubscribe=true
@@ -15,12 +16,15 @@ P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
 Q=$(sed -n "12, 1p" ${filename} | awk '{print $1}')
 np=$(($P*$Q))
 
-# Get local process numbering
 set +u
 if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK+x} ]]; then
+  globalRank=$OMPI_COMM_WORLD_RANK
+  globalSize=$OMPI_COMM_WORLD_SIZE
   rank=$OMPI_COMM_WORLD_LOCAL_RANK
   size=$OMPI_COMM_WORLD_LOCAL_SIZE
 elif [[ -n ${SLURM_LOCALID+x} ]]; then
+  globalRank=$SLURM_PROCID
+  globalSize=$SLURM_NTASKS
   rank=$SLURM_LOCALID
   size=$SLURM_TASKS_PER_NODE
   #Slurm can return a string like "2(x2),1". Get the first number
@@ -28,69 +32,201 @@ elif [[ -n ${SLURM_LOCALID+x} ]]; then
 fi
 set -u
 
-# count the number of physical cores on node
-num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
-num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
-total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
+#Determing node-local grid size
+if [[ "$p" -lt 1 && "$q" -lt 1 ]]; then
+  # no node-local grid was specified, pick defaults
+  q=$(( (Q<=size) ? Q : size))
+
+  if [[ $((size % q)) -gt 0 ]]; then
+    echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting";
+    exit 1
+  fi
+
+  p=$(( size/q ))
+
+elif [[ "$p" -lt 1 ]]; then
+  #q was specified
+
+  if [[ $((size % q)) -gt 0 ]]; then
+    echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting";
+    exit 1
+  fi
+
+  p=$(( size/q ))
+
+elif [[ "$q" -lt 1 ]]; then
+  #p was specified
 
-# We assume a row-major process mapping to nodes
-columns_per_node=$(( Q < size ? Q : size ))
+  if [[ $((size % p)) -gt 0 ]]; then
+    echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting";
+    exit 1
+  fi
+
+  q=$(( size/p ))
+
+else
+  #Both p and q were specified
+  if [[ $size -ne $((p*q)) ]]; then
+    echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting";
+    exit 1
+  fi
+fi
 
 # Check that the columns are evenly divided among nodes
-if [[ $((Q % columns_per_node)) -gt 0 ]]; then
-  echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting";
+if [[ $((P % p)) -gt 0 ]]; then
+  echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting";
   exit 1
 fi
 
 # Check that the rows are evenly divided among nodes
-if [[ $((size % columns_per_node)) -gt 0 ]]; then
-  echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting";
+if [[ $((Q % q)) -gt 0 ]]; then
+  echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting";
   exit 1
 fi
 
-rows_per_node=$(( size/columns_per_node ))
+# count the number of physical cores on node
+num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}')
+num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
+total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 
 # Ranks in different processes rows will take distinct chunks of cores
-row_stride=$((total_cpu_cores/rows_per_node))
-col_stride=$((row_stride/columns_per_node))
+row_stride=$((total_cpu_cores/p))
+col_stride=$((row_stride/q))
 
-myP=$((rank/columns_per_node))
-myQ=$((rank%columns_per_node))
+myp=$((rank%p))
+myq=$((rank/p))
 
-# Default core mapping
+#Although ranks are column-major order, we select GPUs in row-major order on node
+mygpu=$((myq+myp*q))
 
-root_core=$((myP*row_stride + myQ*col_stride))
+# Try to detect special Bard-peak core mapping
+if [[ -n ${HPL_PLATFORM+x} ]]; then
+  platform=$HPL_PLATFORM
+else
+  platform=$(cat /sys/class/dmi/id/product_name)
+fi
 
-omp_num_threads=${col_stride}
-# First omp place is the root core
-omp_places="{$root_core}"
+if [[ "$platform" == "BardPeak" || "$platform" == "HPE_CRAY_EX235A" ]]; then
+  # Special core mapping for BardPeak
+
+  # Debug
+  # if [[ $globalRank == 0 ]]; then
+  #   echo "BardPeak platform detected"
+  # fi
+
+  # Sanity check
+  if [[ $size -gt 8 ]]; then
+    echo "Unsupported number of ranks on BardPeak platform; aborting";
+    exit 1
+  fi
+
+  # GCD0 cores="48-55"
+  # GCD1 cores="56-63"
+  # GCD2 cores="16-23"
+  # GCD3 cores="24-31"
+  # GCD4 cores="0-7"
+  # GCD5 cores="8-15"
+  # GCD6 cores="32-39"
+  # GCD7 cores="40-47"
+
+  root_cores=(48 56 16 24 0 8 32 40)
+  root_core=${root_cores[mygpu]}
+
+  # First omp place is the root core
+  omp_places="{$root_core}"
+
+  # First assign the CCD
+  for i in $(seq $((root_core+1)) $((root_core+8-1)))
+  do
+    omp_places+=",{$i}"
+  done
+  omp_num_threads=8
+
+  places="{$root_core-$((root_core+7))}"
+
+  # Loop through unassigned CCDs
+  for c in $(seq $((mygpu+size)) $size 7)
+  do
+    iroot_core=${root_cores[c]}
+    for i in $(seq $((iroot_core)) $((iroot_core+8-1)))
+    do
+      omp_places+=",{$i}"
+    done
+    omp_num_threads=$((omp_num_threads+8))
+    places+=",{$iroot_core-$((iroot_core+7))}"
+  done
 
-# Make contiuguous chunk of cores (to maximize L1/L2 locality)
-for i in $(seq $((root_core+1)) $((root_core+col_stride-1)))
-do
-  omp_places+=",{$i}"
-done
+  if [[ "${oversubscribe}" == true ]]; then
+    # Add cores from different columns, without their root cores
+    for j in $(seq 0  $((q-1)))
+    do
+      if [[ "$j" == "$myq" ]]; then
+        continue
+      fi
+      for jj in $(seq 0 $size 7)
+      do
+        q_gpu=$((jj+j+myp*q))
+        q_core=$((root_cores[q_gpu]))
+        offset=$(( (q_gpu>=size) ? 0 : 1))
+        for i in $(seq $((q_core+offset)) $((q_core+8-1)))
+        do
+          omp_places+=",{$i}"
+        done
+        omp_num_threads=$((omp_num_threads+8-offset))
+        places+=",{$((q_core+offset))-$((q_core+7))}"
+      done
+    done
+  fi
 
-if [[ "${oversubscribe}" == true ]]; then
+else
+  # Default core mapping
+  root_core=$((myp*row_stride + myq*col_stride))
+
+  omp_num_threads=${col_stride}
+  # First omp place is the root core
+  omp_places="{$root_core}"
+
+  # Make contiuguous chunk of cores (to maximize L1/L2 locality)
+  for i in $(seq $((root_core+1)) $((root_core+col_stride-1)))
+  do
+    omp_places+=",{$i}"
+  done
+
+  if [[ $col_stride -gt 1 ]]; then
+    places="{$root_core-$((root_core+col_stride-1))}"
+  else
+    places="{$root_core}"
+  fi
+
+  if [[ "${oversubscribe}" == true ]]; then
     # Add cores from different columns, without their root cores
-    for q in $(seq 0 $((columns_per_node-1)))
+    for j in $(seq 0 $((q-1)))
     do
-      if [[ "$q" == "$myQ" ]]; then
+      if [[ "$j" == "$myq" ]]; then
         continue
       fi
-      q_core=$((myP*row_stride + q*col_stride))
+      q_core=$((myp*row_stride + j*col_stride))
       for i in $(seq $((q_core+1)) $((q_core+col_stride-1)))
       do
         omp_places+=",{$i}"
       done
       omp_num_threads=$((omp_num_threads+col_stride-1))
+
+      if [[ $col_stride -gt 2 ]]; then
+        places+=",{$((q_core+1))-$((q_core+col_stride-1))}"
+      elif [[ $col_stride -gt 1 ]]; then
+        places+=",{$((q_core+1))}"
+      fi
+
     done
+  fi
 fi
-
 # Export OpenMP config
 export OMP_NUM_THREADS=${omp_num_threads}
 export OMP_PLACES=${omp_places}
 export OMP_PROC_BIND=true
-
+if [[ $globalRank -lt $size ]]; then
+  echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places"
+fi
 #run
 ${hpl_bin} 
diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp
index 52111ac..2130c0a 100644
--- a/src/grid/HPL_grid_init.cpp
+++ b/src/grid/HPL_grid_init.cpp
@@ -56,6 +56,8 @@ int HPL_grid_init
    const HPL_T_ORDER                ORDER,
    const int                        NPROW,
    const int                        NPCOL,
+   const int                        p,
+   const int                        q,
    HPL_T_grid *                     GRID
 )
 #else
@@ -65,6 +67,8 @@ int HPL_grid_init
    const HPL_T_ORDER                ORDER;
    const int                        NPROW;
    const int                        NPCOL;
+   const int                        p,
+   const int                        q,
    HPL_T_grid *                     GRID;
 #endif
 {
@@ -111,7 +115,8 @@ int HPL_grid_init
  * .. Local Variables ..
  */
    int                        hdim, hplerr=MPI_SUCCESS, ierr, ip2, k,
-                              mask, mycol, myrow, nprocs, rank, size;
+                              mask, mycol, myrow, nprocs, rank, size,
+                              local_myrow, local_mycol;
 /* ..
  * .. Executable Statements ..
  */
@@ -125,18 +130,29 @@ int HPL_grid_init
 /*
  * Row- or column-major ordering of the processes
  */
+   int local_size = p * q;
+   int local_rank = rank % local_size;
+   int node       = rank / local_size; // node number
+
    if( ORDER == HPL_ROW_MAJOR )
    {
       GRID->order = HPL_ROW_MAJOR;
+      local_mycol = local_rank % q; local_myrow = local_rank / q;
+      int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q);
+      myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol;
       myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
    }
    else
    {
       GRID->order = HPL_COLUMN_MAJOR;
-      mycol = rank / NPROW; myrow = rank - mycol * NPROW;
+      local_mycol = local_rank / p; local_myrow = local_rank % p;
+      int noderow = node % (NPROW / p); int nodecol = node / (NPROW / p);
+      myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol;
    }
    GRID->iam   = rank;  GRID->myrow = myrow; GRID->mycol  = mycol;
    GRID->nprow = NPROW; GRID->npcol = NPCOL; GRID->nprocs = nprocs;
+   GRID->local_myrow = local_myrow; GRID->local_mycol = local_mycol;
+   GRID->local_nprow = p; GRID->local_npcol = q;
 /*
  * row_ip2   : largest power of two <= nprow;
  * row_hdim  : row_ip2 procs hypercube dim;
diff --git a/src/pgesv/HPL_pdgesvK2_HIP.cpp b/src/pgesv/HPL_pdgesvK2_HIP.cpp
index 3632fc1..ef74cdd 100644
--- a/src/pgesv/HPL_pdgesvK2_HIP.cpp
+++ b/src/pgesv/HPL_pdgesvK2_HIP.cpp
@@ -286,7 +286,6 @@ void HPL_pdgesvK2_HIP
     /* if this is process 0,0 and not the first panel */
     if(GRID->myrow == 0 && mycol == 0 && j > 0) {
       time = HPL_ptimer_walltime() - start_time;
-      gflops = 2.0 * (N * (double)N * N - n * (double)n * n) / 3.0 / (time > 0.0 ? time : 1.e-6) / 1.e9;
       printf("Column=%09d (%4.1f%%) ", j, j * 100.0 / N);
       printf("Step Time(s)=%9.7f ", stepEnd-stepStart);
 
@@ -305,6 +304,7 @@ void HPL_pdgesvK2_HIP
       if (panel[0]->nu1) printf("DGEMM2 Gflops=%9.3e ", largeDgemm1Gflops);
       else printf("DGEMM2 Gflops=--------- ");
       
+      gflops = 2.0 * (A->n * (double)A->n * A->n - (A->n - j) * (double)(A->n - j) * (A->n - j)) / 3.0 / (time > 0.0 ? time : 1.e-6) / 1.e9;
       printf("Overall Gflops=%9.3e\n", gflops);
     }
 #endif
diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp
index e51a57f..36b07ee 100644
--- a/testing/backend/HPL_backendHIP.cpp
+++ b/testing/backend/HPL_backendHIP.cpp
@@ -3,13 +3,16 @@
 
 
 
-void HIP::init(size_t num_gpus)
+void HIP::init(const HPL_T_grid* GRID)
 {
     int rank, size, count, namelen; 
     size_t bytes;
     char (*host_names)[MPI_MAX_PROCESSOR_NAME];
+    int nprow, npcol, myrow, mycol;
 
 
+    (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
     MPI_Comm_rank( MPI_COMM_WORLD, &rank );
     MPI_Comm_size( MPI_COMM_WORLD, &size );
     MPI_Get_processor_name(host_name,&namelen);
@@ -23,14 +26,9 @@ void HIP::init(size_t num_gpus)
     for (int n=0; n < size; n++){
         MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
     }
-    int localRank = 0;
-    for (int n = 0; n < rank; n++){
-        if (!strcmp(host_name, host_names[n])) localRank++;
-    }
-    int localSize = 0;
-    for (int n = 0; n < size; n++){
-        if (!strcmp(host_name, host_names[n])) localSize++;
-    }
+
+    int localRank = GRID->local_mycol + GRID->local_myrow * GRID->local_npcol;
+    int localSize = GRID->local_npcol * GRID->local_nprow;
 
 
     hipDeviceProp_t hipDeviceProp;
@@ -169,508 +167,122 @@ void HIP::panel_new(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N
 }
 
 
-void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N, const int JB,
-                     HPL_T_pmat *A, const int IA, const int JA, const int TAG, HPL_T_panel *PANEL)
+
+
+void HIP::panel_send_to_host(HPL_T_panel *PANEL)
 {
-  size_t dalign;
-  int icurcol, icurrow, ii, itmp1, jj, lwork,
-      ml2, mp, mycol, myrow, nb, npcol, nprow,
-      nq, nu, ldu;
-  /* ..
-   * .. Executable Statements ..
-   */
-  PANEL->grid = GRID; /* ptr to the process grid */
-  PANEL->algo = ALGO; /* ptr to the algo parameters */
-  PANEL->pmat = A;    /* ptr to the local array info */
+    int jb = PANEL->jb;
 
-  myrow = GRID->myrow;
-  mycol = GRID->mycol;
-  nprow = GRID->nprow;
-  npcol = GRID->npcol;
-  nb = A->nb;
+    if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return;
+    if(PANEL->mp > 0)
+    HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->A,  PANEL->lda*sizeof(double),
+                    PANEL->dA, PANEL->dlda*sizeof(double),
+                    PANEL->mp*sizeof(double), jb,
+                    hipMemcpyDeviceToHost, dataStream));
+    HIP_CHECK_ERROR(hipEventRecord(panelCopy, dataStream));
+}
 
-  HPL_infog2l(IA, JA, nb, nb, nb, nb, 0, 0, myrow, mycol,
-              nprow, npcol, &ii, &jj, &icurrow, &icurcol);
-  mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow);
-  nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol);
+// Only for P=1
+void HPL_unroll_ipiv(const int mp, const int jb, int* ipiv, int* ipiv_ex, int* upiv)
+{
+  for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids for the swapping
+  for(int i = 0; i < jb; i++) {                // swap ids
+    int id = upiv[i];
+    upiv[i] = upiv[ipiv[i]];
+    upiv[ipiv[i]] = id;
+  }
 
-  const int inxtcol = MModAdd1(icurcol, npcol);
-  const int inxtrow = MModAdd1(icurrow, nprow);
+  for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; }
 
-  /* ptr to trailing part of A */
-  PANEL->A = A->A;
-  PANEL->dA = Mptr((double *)(A->d_A), ii, jj, A->ld);
+  int cnt = 0;
+  for(int i = jb; i < mp; i++) { // find swapped ids outside of panel
+    if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; }
+  }
+}
 
-  /*
-   * Workspace pointers are initialized to NULL.
-   */
-  PANEL->L2 = nullptr;
-  PANEL->dL2 = nullptr;
-  PANEL->L1 = nullptr;
-  PANEL->dL1 = nullptr;
-  PANEL->DINFO = nullptr;
-  PANEL->U = nullptr;
-  PANEL->dU = nullptr;
-  PANEL->W = nullptr;
-  PANEL->dW = nullptr;
-  PANEL->U1 = nullptr;
-  PANEL->dU1 = nullptr;
-  PANEL->W1 = nullptr;
-  PANEL->dW1 = nullptr;
-  PANEL->U2 = nullptr;
-  PANEL->dU2 = nullptr;
-  PANEL->W2 = nullptr;
-  PANEL->dW2 = nullptr;
-  /*
-   * Local lengths, indexes process coordinates
-   */
-  PANEL->nb = nb;           /* distribution blocking factor */
-  PANEL->jb = JB;           /* panel width */
-  PANEL->m = M;             /* global # of rows of trailing part of A */
-  PANEL->n = N;             /* global # of cols of trailing part of A */
-  PANEL->ia = IA;           /* global row index of trailing part of A */
-  PANEL->ja = JA;           /* global col index of trailing part of A */
-  PANEL->mp = mp;           /* local # of rows of trailing part of A */
-  PANEL->nq = nq;           /* local # of cols of trailing part of A */
-  PANEL->ii = ii;           /* local row index of trailing part of A */
-  PANEL->jj = jj;           /* local col index of trailing part of A */
-  PANEL->lda = Mmax(1, mp); /* local leading dim of array A */
-  PANEL->dlda = A->ld;      /* local leading dim of array A */
-  PANEL->prow = icurrow;    /* proc row owning 1st row of trailing A */
-  PANEL->pcol = icurcol;    /* proc col owning 1st col of trailing A */
-  PANEL->msgid = TAG;       /* message id to be used for panel bcast */
-  /*
-   * Initialize  ldl2 and len to temporary dummy values and Update tag for
-   * next panel
-   */
-  PANEL->ldl2 = 0;  /* local leading dim of array L2 */
-  PANEL->dldl2 = 0; /* local leading dim of array L2 */
-  PANEL->len = 0;   /* length of the buffer to broadcast */
-  PANEL->nu0 = 0;
-  PANEL->nu1 = 0;
-  PANEL->nu2 = 0;
-  PANEL->ldu0 = 0;
-  PANEL->ldu1 = 0;
-  PANEL->ldu2 = 0;
+void HIP::panel_send_to_device(HPL_T_panel *PANEL)
+{
+  double *A, *dA;
+  int jb, i, ml2;
 
-  /*Split fraction*/
-  const double fraction = 0.7;
+  jb = PANEL->jb;
 
-  dalign = ALGO->align * sizeof(double);
-  size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
+  if (jb <= 0)
+     return;
 
-  if (npcol == 1) /* P x 1 process grid */
-  {               /* space for L1, DPIV, DINFO */
-    lwork = ALGO->align + (PANEL->len = JB * JB + lpiv) + 1;
-    nu = Mmax(0, nq - JB);
-    ldu = nu + 256; /*extra space for padding*/
-    lwork += JB * ldu;
+  // only the root column copies to device
+  if (PANEL->grid->mycol == PANEL->pcol) {
 
-    if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double))
-    {
-      if (PANEL->WORK)
-      {
-        HIP_CHECK_ERROR(hipFree(PANEL->dWORK));
-        HIP_CHECK_ERROR(hipHostFree(PANEL->WORK));
-      }
-      size_t numbytes = (size_t)(lwork) * sizeof(double);
+    if(PANEL->grid->nprow == 1) {
 
-      if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS ||
-          hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS)
-      {
-        HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed");
-      }
-      PANEL->max_work_size = (size_t)(lwork) * sizeof(double);
+      // unroll pivoting and send to device now
+      int* ipiv    = PANEL->ipiv;
+      int* ipiv_ex = PANEL->ipiv + jb;
+      int* upiv    = PANEL->IWORK + jb; // scratch space
 
-#ifdef HPL_VERBOSE_PRINT
-      if ((myrow == 0) && (mycol == 0))
-      {
-        printf("Allocating %g GBs of storage on CPU...",
-               ((double)numbytes) / (1024 * 1024 * 1024));
-        fflush(stdout);
+      for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift
+      HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv);
 
-        printf("done.\n");
-        printf("Allocating %g GBs of storage on GPU...",
-               ((double)numbytes) / (1024 * 1024 * 1024));
-        fflush(stdout);
-        printf("done.\n");
-      }
-#endif
-    }
-    /*
-     * Initialize the pointers of the panel structure  -  Always re-use A in
-     * the only process column
-     */
-    PANEL->ldl2 = Mmax(1, mp);
-    PANEL->dldl2 = A->ld;
-    PANEL->dL2 = PANEL->dA + (myrow == icurrow ? JB : 0);
-    PANEL->L2 = PANEL->A + (myrow == icurrow ? JB : 0);
-    PANEL->U = (double *)PANEL->WORK;
-    PANEL->dU = (double *)PANEL->dWORK;
-    PANEL->L1 = (double *)PANEL->WORK + (JB * Mmax(0, ldu));
-    PANEL->dL1 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu));
-    PANEL->W = A->W;
-    PANEL->dW = A->dW;
+      int* dipiv    = PANEL->dipiv;
+      int* dipiv_ex = PANEL->dipiv + jb;
 
-    if (nprow == 1)
-    {
-      PANEL->nu0 = Mmin(JB, nu);
-      PANEL->ldu0 = PANEL->nu0;
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int),
+                       upiv, jb * sizeof(int),
+                       jb * sizeof(int), 1,
+                       hipMemcpyHostToDevice, dataStream));
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv_ex, jb * sizeof(int),
+                       ipiv_ex, jb * sizeof(int),
+                       jb * sizeof(int), 1,
+                       hipMemcpyHostToDevice, dataStream));
+    } 
+    else {
+      int k;
+      int *iflag, *ipl, *ipID, *ipA, *iplen, *ipmap, *ipmapm1, *upiv, *iwork, 
+          *lindxU, *lindxA, *lindxAU, *permU, *permU_ex, *ipiv, 
+          *dlindxU, *dlindxA, *dlindxAU, *dpermU, *dpermU_ex, *dipiv;
 
-      PANEL->nu1 = 0;
-      PANEL->ldu1 = 0;
+      k = (int)((unsigned int)(jb) << 1);
+      iflag = PANEL->IWORK; ipl = iflag + 1; ipID = ipl + 1; ipA = ipID + ((unsigned int)(k) << 1); 
+      iplen = ipA + 1; ipmap = iplen + PANEL->grid->nprow + 1; ipmapm1 = ipmap + PANEL->grid->nprow; 
+      upiv = ipmapm1 + PANEL->grid->nprow; iwork = upiv + PANEL->mp;
 
-      PANEL->nu2 = nu - PANEL->nu0;
-      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
+      lindxU = PANEL->lindxU; lindxA = PANEL->lindxA; lindxAU = PANEL->lindxAU; 
+      permU = PANEL->permU; permU_ex = permU + jb; ipiv = PANEL->ipiv;
+      dlindxU = PANEL->dlindxU; dlindxA = PANEL->dlindxA; dlindxAU = PANEL->dlindxAU; 
+      dpermU = PANEL->dpermU; dpermU_ex = dpermU + jb; dipiv = PANEL->dipiv;
 
-      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
-      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
-      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
-      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
+      if(*iflag == -1) { /* no index arrays have been computed so far */
+        HPL_pipid(PANEL, ipl, ipID);
+        HPL_plindx(PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork);
+        *iflag = 1;
+      }
 
-      PANEL->permU = (int *)(PANEL->L1 + JB * JB);
-      PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB);
-      PANEL->ipiv = PANEL->permU + JB;
-      PANEL->dipiv = PANEL->dpermU + JB;
+      int N = Mmax(*ipA, jb);
+      if(N > 0) {
+        HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxA, k * sizeof(int), lindxA, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
+        HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxAU, k * sizeof(int), lindxAU, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
+      }
 
-      PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB);
-      PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB);
+      HIP_CHECK_ERROR(hipMemcpyAsync(dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream));
+
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dpermU, jb * sizeof(int), permU, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), ipiv, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
     }
-    else
-    {
-      const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb));
-      PANEL->nu0 = Mmin(JB, nu);
-      PANEL->ldu0 = PANEL->nu0;
+  }
 
-      PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit);
-      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
+  //record when the swap data will arrive
+  HIP_CHECK_ERROR(hipEventRecord(swapDataTransfer, dataStream));
 
-      PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2;
-      PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/
+  // copy A and/or L2
+  if(PANEL->grid->mycol == PANEL->pcol) {
+    // copy L1
+    HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->dL1, jb * sizeof(double),
+                     PANEL->L1, jb * sizeof(double),
+                     jb * sizeof(double), jb,
+                     hipMemcpyHostToDevice, dataStream));
 
-      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
-      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
-      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
-      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
-
-      PANEL->W1 = PANEL->W + PANEL->ldu0 * JB;
-      PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB;
-      PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB;
-      PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB;
-
-      PANEL->lindxA = (int *)(PANEL->L1 + JB * JB);
-      PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB);
-      PANEL->lindxAU = PANEL->lindxA + JB;
-      PANEL->dlindxAU = PANEL->dlindxA + JB;
-      PANEL->lindxU = PANEL->lindxAU + JB;
-      PANEL->dlindxU = PANEL->dlindxAU + JB;
-      PANEL->permU = PANEL->lindxU + JB;
-      PANEL->dpermU = PANEL->dlindxU + JB;
-
-      // Put ipiv array at the end
-      PANEL->dipiv = PANEL->dpermU + JB;
-      PANEL->ipiv = PANEL->permU + JB;
-
-      PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv;
-      PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv;
-    }
-
-    *(PANEL->DINFO) = 0.0;
-  }
-  else  // for ncol != 1
-  { /* space for L2, L1, DPIV */
-    ml2 = (myrow == icurrow ? mp - JB : mp);
-    ml2 = Mmax(0, ml2);
-    ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/
-    itmp1 = JB * JB + lpiv;              // L1, integer arrays
-    PANEL->len = ml2 * JB + itmp1;
-
-    lwork = ALGO->align + PANEL->len + 1;
-
-    nu = Mmax(0, (mycol == icurcol ? nq - JB : nq));
-    ldu = nu + 256; /*extra space for potential padding*/
-
-    // if( nprow > 1 )                                 /* space for U */
-    {
-      lwork += JB * ldu;
-    }
-    if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double))
-    {
-      if (PANEL->WORK)
-      {
-        HIP_CHECK_ERROR(hipFree(PANEL->dWORK));
-        HIP_CHECK_ERROR(hipHostFree(PANEL->WORK));
-      }
-      size_t numbytes = (size_t)(lwork) * sizeof(double);
-
-      if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS ||
-          hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS)
-      {
-        HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed");
-      }
-      PANEL->max_work_size = (size_t)(lwork) * sizeof(double);
-#ifdef HPL_VERBOSE_PRINT
-      if ((myrow == 0) && (mycol == 0))
-      {
-        printf("Allocating %g GBs of storage on CPU...",
-               ((double)numbytes) / (1024 * 1024 * 1024));
-        fflush(stdout);
-        printf("done.\n");
-        printf("Allocating %g GBs of storage on GPU...",
-               ((double)numbytes) / (1024 * 1024 * 1024));
-        fflush(stdout);
-        printf("done.\n");
-      }
-#endif
-    }
-    /*
-     * Initialize the pointers of the panel structure - Re-use A in the cur-
-     * rent process column when HPL_COPY_L is not defined.
-     */
-    PANEL->U = (double *)PANEL->WORK;
-    PANEL->dU = (double *)PANEL->dWORK;
-
-    PANEL->W = A->W;
-    PANEL->dW = A->dW;
-
-    PANEL->L2 = (double *)PANEL->WORK + (JB * Mmax(0, ldu));
-    PANEL->dL2 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu));
-    PANEL->L1 = PANEL->L2 + ml2 * JB;
-    PANEL->dL1 = PANEL->dL2 + ml2 * JB;
-    PANEL->ldl2 = Mmax(1, ml2);
-    PANEL->dldl2 = Mmax(1, ml2);
-
-    if (nprow == 1)
-    {
-      PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0;
-      PANEL->ldu0 = PANEL->nu0;
-
-      PANEL->nu1 = 0;
-      PANEL->ldu1 = 0;
-
-      PANEL->nu2 = nu - PANEL->nu0;
-      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
-
-      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
-      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
-      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
-      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
-
-      PANEL->permU = (int *)(PANEL->L1 + JB * JB);
-      PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB);
-      PANEL->ipiv = PANEL->permU + JB;
-      PANEL->dipiv = PANEL->dpermU + JB;
-
-      PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB);
-      PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB);
-    }
-    else
-    {
-      const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb));
-      PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0;
-      PANEL->ldu0 = PANEL->nu0;
-
-      PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit);
-      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
-
-      PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2;
-      PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/
-
-      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
-      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
-      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
-      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
-
-      PANEL->W1 = PANEL->W + PANEL->ldu0 * JB;
-      PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB;
-      PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB;
-      PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB;
-
-      PANEL->lindxA = (int *)(PANEL->L1 + JB * JB);
-      PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB);
-      PANEL->lindxAU = PANEL->lindxA + JB;
-      PANEL->dlindxAU = PANEL->dlindxA + JB;
-      PANEL->lindxU = PANEL->lindxAU + JB;
-      PANEL->dlindxU = PANEL->dlindxAU + JB;
-      PANEL->permU = PANEL->lindxU + JB;
-      PANEL->dpermU = PANEL->dlindxU + JB;
-
-      // Put ipiv array at the end
-      PANEL->ipiv = PANEL->permU + JB;
-      PANEL->dipiv = PANEL->dpermU + JB;
-
-      PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv;
-      PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv;
-    }
-
-    *(PANEL->DINFO) = 0.0;
-  }
-
-  if (nprow == 1)
-  {
-    lwork = mp + JB;
-  }
-  else
-  {
-    itmp1 = (JB << 1);
-    lwork = nprow + 1;
-    itmp1 = Mmax(itmp1, lwork);
-    lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1;
-  }
-
-  if (PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int))
-  {
-    if (PANEL->IWORK)
-    {
-      std::free(PANEL->IWORK);
-    }
-    size_t numbytes = (size_t)(lwork) * sizeof(int);
-    PANEL->IWORK = (int *)std::malloc(numbytes);
-    if (PANEL->IWORK == NULL)
-    {
-      HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host Integer Memory allocation failed");
-    }
-    PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int);
-  }
-  if (lwork)
-    *(PANEL->IWORK) = -1;
-
-  /* ensure the temp buffer in HPL_pdfact is allocated once*/
-  lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1));
-  if (PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double))
-  {
-    if (PANEL->fWORK)
-    {
-      HIP_CHECK_ERROR(hipHostFree(PANEL->fWORK));
-    }
-    size_t numbytes = (size_t)(lwork) * sizeof(double);
-
-    HIP_CHECK_ERROR(hipHostMalloc((void **)&PANEL->fWORK, numbytes));
-    if (PANEL->fWORK == NULL)
-    {
-      HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host pdfact Scratch Memory allocation failed");
-    }
-    PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double);
-  }
-  /*
-   * End of HPL_pdpanel_init
-   */
-}
-
-void HIP::panel_send_to_host(HPL_T_panel *PANEL)
-{
-    int jb = PANEL->jb;
-
-    if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return;
-    if(PANEL->mp > 0)
-    HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->A,  PANEL->lda*sizeof(double),
-                    PANEL->dA, PANEL->dlda*sizeof(double),
-                    PANEL->mp*sizeof(double), jb,
-                    hipMemcpyDeviceToHost, dataStream));
-    HIP_CHECK_ERROR(hipEventRecord(panelCopy, dataStream));
-}
-
-// Only for P=1
-void HPL_unroll_ipiv(const int mp, const int jb, int* ipiv, int* ipiv_ex, int* upiv)
-{
-  for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids for the swapping
-  for(int i = 0; i < jb; i++) {                // swap ids
-    int id = upiv[i];
-    upiv[i] = upiv[ipiv[i]];
-    upiv[ipiv[i]] = id;
-  }
-
-  for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; }
-
-  int cnt = 0;
-  for(int i = jb; i < mp; i++) { // find swapped ids outside of panel
-    if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; }
-  }
-}
-
-void HIP::panel_send_to_device(HPL_T_panel *PANEL)
-{
-  double *A, *dA;
-  int jb, i, ml2;
-
-  jb = PANEL->jb;
-
-  if (jb <= 0)
-     return;
-
-  // only the root column copies to device
-  if (PANEL->grid->mycol == PANEL->pcol) {
-
-    if(PANEL->grid->nprow == 1) {
-
-      // unroll pivoting and send to device now
-      int* ipiv    = PANEL->ipiv;
-      int* ipiv_ex = PANEL->ipiv + jb;
-      int* upiv    = PANEL->IWORK + jb; // scratch space
-
-      for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift
-      HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv);
-
-      int* dipiv    = PANEL->dipiv;
-      int* dipiv_ex = PANEL->dipiv + jb;
-
-      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int),
-                       upiv, jb * sizeof(int),
-                       jb * sizeof(int), 1,
-                       hipMemcpyHostToDevice, dataStream));
-      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv_ex, jb * sizeof(int),
-                       ipiv_ex, jb * sizeof(int),
-                       jb * sizeof(int), 1,
-                       hipMemcpyHostToDevice, dataStream));
-    } 
-    else {
-      int k;
-      int *iflag, *ipl, *ipID, *ipA, *iplen, *ipmap, *ipmapm1, *upiv, *iwork, 
-          *lindxU, *lindxA, *lindxAU, *permU, *permU_ex, *ipiv, 
-          *dlindxU, *dlindxA, *dlindxAU, *dpermU, *dpermU_ex, *dipiv;
-
-      k = (int)((unsigned int)(jb) << 1);
-      iflag = PANEL->IWORK; ipl = iflag + 1; ipID = ipl + 1; ipA = ipID + ((unsigned int)(k) << 1); 
-      iplen = ipA + 1; ipmap = iplen + PANEL->grid->nprow + 1; ipmapm1 = ipmap + PANEL->grid->nprow; 
-      upiv = ipmapm1 + PANEL->grid->nprow; iwork = upiv + PANEL->mp;
-
-      lindxU = PANEL->lindxU; lindxA = PANEL->lindxA; lindxAU = PANEL->lindxAU; 
-      permU = PANEL->permU; permU_ex = permU + jb; ipiv = PANEL->ipiv;
-      dlindxU = PANEL->dlindxU; dlindxA = PANEL->dlindxA; dlindxAU = PANEL->dlindxAU; 
-      dpermU = PANEL->dpermU; dpermU_ex = dpermU + jb; dipiv = PANEL->dipiv;
-
-      if(*iflag == -1) { /* no index arrays have been computed so far */
-        HPL_pipid(PANEL, ipl, ipID);
-        HPL_plindx(PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork);
-        *iflag = 1;
-      }
-
-      int N = Mmax(*ipA, jb);
-      if(N > 0) {
-        HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxA, k * sizeof(int), lindxA, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
-        HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxAU, k * sizeof(int), lindxAU, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
-      }
-
-      HIP_CHECK_ERROR(hipMemcpyAsync(dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream));
-
-      HIP_CHECK_ERROR(hipMemcpy2DAsync(dpermU, jb * sizeof(int), permU, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
-      HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), ipiv, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream));
-    }
-  }
-
-  //record when the swap data will arrive
-  HIP_CHECK_ERROR(hipEventRecord(swapDataTransfer, dataStream));
-
-  // copy A and/or L2
-  if(PANEL->grid->mycol == PANEL->pcol) {
-    // copy L1
-    HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->dL1, jb * sizeof(double),
-                     PANEL->L1, jb * sizeof(double),
-                     jb * sizeof(double), jb,
-                     hipMemcpyHostToDevice, dataStream));
-
-    //record when L1 will arrive
-    HIP_CHECK_ERROR(hipEventRecord(L1Transfer, dataStream));
+    //record when L1 will arrive
+    HIP_CHECK_ERROR(hipEventRecord(L1Transfer, dataStream));
 
     if(PANEL->grid->npcol > 1) { // L2 is its own array
       if(PANEL->grid->myrow == PANEL->prow) {
@@ -723,16 +335,6 @@ int HIP::panel_free(HPL_T_panel *PANEL)
   return HPL_SUCCESS;
 }
 
-int HIP::panel_disp(HPL_T_panel **PANEL)
-{
-    GPUInfo("%-40s \t%-5s", "[Deallocate]", "Panel structure", "HIP");
-    (*PANEL)->free_work_now = 1;
-    int err = HIP::panel_free(*PANEL);
-    if (*PANEL) free(*PANEL);
-    *PANEL = NULL;
-    return( err );
-}
-
 void HIP::gPrintMat(const int M, const int N, const int LDA, const double *A)
 {
     // Last row is the vector b
@@ -751,7 +353,7 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO,  HPL_T_p
   int mycol, myrow, npcol, nprow, nq, info[3];
   (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
 
-  mat->n    = N; mat->nb   = NB; mat->info = 0;
+  mat->n    = N; mat->nb   = NB; mat->info = 0; mat->dN = N;
   mat->mp   = HPL_numroc(N, NB, NB, myrow, 0, nprow);
   nq        = HPL_numroc(N, NB, NB, mycol, 0, npcol);
   /*
@@ -764,16 +366,8 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO,  HPL_T_p
    * Ensure that lda is a multiple of ALIGN and not a power of 2, and not
    * a multiple of 4096 bytes
    */
-  mat->ld = ((Mmax(1, mat->mp) - 1) / ALGO->align) * ALGO->align;
-  do {
-    ii  = (mat->ld += ALGO->align);
-    ip2 = 1;
-    while(ii > 1) {
-      ii >>= 1;
-      ip2 <<= 1;
-    }
-    im4096 = (mat->ld % 512 ) ? 0 : 1;
-  } while((mat->ld == ip2) || im4096);
+  mat->ld = Mmax(1, mat->mp);
+  mat->ld = ((mat->ld + 95) / 128) * 128 + 32; /*pad*/
 
   mat->nq = nq + 1;
 
@@ -861,8 +455,8 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO,  HPL_T_p
   workspace_size  = Mmax((2 * Anp + nq) * sizeof(double), workspace_size);
 
   /*Scratch space for rows in pdlaswp (with extra space for padding) */
-  dworkspace_size = Mmax((nq+256) * mat->nb * sizeof(double), dworkspace_size);
-  workspace_size  = Mmax((nq+256) * mat->nb * sizeof(double), workspace_size);
+  dworkspace_size = Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), dworkspace_size);
+  workspace_size = Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), workspace_size);
 
 #ifdef HPL_VERBOSE_PRINT
   if((myrow == 0) && (mycol == 0)) {
@@ -919,6 +513,18 @@ void HIP::pdmatfree(HPL_T_pmat* mat) {
 
 }
 
+int HIP::panel_disp(HPL_T_panel **PANEL)
+{
+    GPUInfo("%-40s \t%-5s", "[Deallocate]", "Panel structure", "HIP");
+    (*PANEL)->pmat->n = (*PANEL)->pmat->dN;
+    (*PANEL)->free_work_now = 1;
+    int err = HIP::panel_free(*PANEL);
+    if (*PANEL) free(*PANEL);
+    *PANEL = NULL;
+    return( err );
+}
+
+
 void HIP::matgen(const HPL_T_grid *GRID, const int M, const int N,
                  const int NB, double *A, const int LDA,
                  const int ISEED)
@@ -1278,47 +884,176 @@ void HIP::move_data(double *DST, const double *SRC, const size_t SIZE, const int
     HIP_CHECK_ERROR(hipMemcpy(DST, SRC, SIZE, (hipMemcpyKind)KIND));
 }
 
-void HIP::move_data_2d(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, const int KIND)
-{
-    char title[25] = "[MOVE2D_"; strcat(title,_memcpyKind[KIND]); strcat(title,"]");
-    GPUInfo("%-25s %-12ld (B) \t%-5s", title, "Memory of size",  SIZE, "HIP");
-    HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, src, spitch, width, height, (hipMemcpyKind)KIND));
-}
+void HIP::move_data_2d(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, const int KIND)
+{
+    char title[25] = "[MOVE2D_"; strcat(title,_memcpyKind[KIND]); strcat(title,"]");
+    GPUInfo("%-25s %-12ld (B) \t%-5s", title, "Memory of size",  SIZE, "HIP");
+    HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, src, spitch, width, height, (hipMemcpyKind)KIND));
+}
+
+
+void HIP::device_sync() {
+    HIP_CHECK_ERROR(hipDeviceSynchronize());
+}
+
+int HIP::bcast_ibcst(HPL_T_panel* PANEL, int* IFLAG) {
+
+  double *L2ptr;
+#ifdef ROCM
+  L2ptr = PANEL->dL2;
+#else
+  L2ptr = PANEL->L2;
+#endif
+
+  if(PANEL == NULL) {
+    return HPL_SUCCESS;
+  }
+  if(PANEL->grid->npcol <= 1) {
+    return HPL_SUCCESS;
+  }
+
+  MPI_Comm comm  = PANEL->grid->row_comm;
+  int root  = PANEL->pcol;
+
+  if(PANEL->len <= 0) return HPL_SUCCESS;
+  int ierr = MPI_Ibcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm, &bcast_req);
+  *IFLAG = ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
+  return *IFLAG;
+}
+
+int HIP::bwait_ibcast(HPL_T_panel* PANEL) {
+  int ierr;
+  ierr = MPI_Wait(&bcast_req, MPI_STATUS_IGNORE);
+  return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
+}
+
+void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, const HPL_T_UPD UPD, const SWP_PHASE phase) {
+  double *U, *W;
+  double *dA, *dU, *dW;
+  int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU;
+  int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex;
+  int  icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; 
+  MPI_Comm comm;
+
+  /*
+   * Retrieve parameters from the PANEL data structure
+   */
+  n = PANEL->n; jb = PANEL->jb;
+  nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow;
+  comm = PANEL->grid->col_comm; icurrow = PANEL->prow;
+  iflag = PANEL->IWORK;
+  dA = PANEL->dA; lda = PANEL->dlda;
+  PANEL->pmat->n = PANEL->dldl1;
+
+  // Quick return if we're 1xQ
+  if(phase != SWP_END && nprow == 1) return;
+
+  pdlaswp_set_var(PANEL, dU, U, LDU, dW, W, LDW, n, dA, UPD);
+
+  /* Quick return if there is nothing to do */
+  if((n <= 0) || (jb <= 0)) return;
+
+  // Quick swapping if P==1
+  if (phase == SWP_END && nprow == 1) {
+    // wait for swapping data to arrive
+    HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, SWAPDATATRANSFER, HPL_TR);
 
+    HIP::HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv);
+    return;
+  }
 
-void HIP::device_sync() {
-    HIP_CHECK_ERROR(hipDeviceSynchronize());
-}
+  /*
+   * Compute ipID (if not already done for this panel). lindxA and lindxAU
+   * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
+   * are of size nprow,  permU is of length jb, and  this function needs a
+   * workspace of size max( 2 * jb (plindx1), nprow+1(equil)):
+   * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1)
+   * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1);
+   */
+  k = (int)((unsigned int)(jb) << 1);
+  ipl = iflag + 1;
+  ipID = ipl + 1;
+  ipA = ipID + ((unsigned int)(k) << 1);
+  iplen = ipA + 1;
+  ipcounts = iplen + nprow + 1;
+  ipoffsets = ipcounts + nprow;
+  iwork = ipoffsets + nprow;
 
-int HIP::bcast_ibcst(HPL_T_panel* PANEL, int* IFLAG) {
+  if (phase == SWP_START) {
+    if(*iflag == -1) {/* no index arrays have been computed so far */
+        // get the ipivs on the host after the Bcast
+        if(PANEL->grid->mycol != PANEL->pcol) {
+          HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->ipiv, PANEL->jb * sizeof(int),
+                          PANEL->dipiv, PANEL->jb * sizeof(int),
+                          PANEL->jb * sizeof(int), 1,
+                          hipMemcpyDeviceToHost, HIP::dataStream));
+        }
+        HPL_BE_stream_synchronize(HPL_DATASTREAM, HPL_TR);
 
-  double *L2ptr;
-#ifdef ROCM
-  L2ptr = PANEL->dL2;
-#else
-  L2ptr = PANEL->L2;
-#endif
+        // compute spreading info
+        HPL_pipid(PANEL, ipl, ipID);
+        HPL_plindx(PANEL, *ipl, ipID, ipA, PANEL->lindxU, PANEL->lindxAU, PANEL->lindxA, iplen, PANEL->permU, iwork);
+        *iflag = 1;
+    }
 
-  if(PANEL == NULL) {
-    return HPL_SUCCESS;
-  }
-  if(PANEL->grid->npcol <= 1) {
-    return HPL_SUCCESS;
+      /*
+      * For i in [0..2*jb),  lindxA[i] is the offset in A of a row that ulti-
+      * mately goes to U( :, lindxAU[i] ).  In each rank, we directly pack
+      * into U, otherwise we pack into workspace. The  first
+      * entry of each column packed in workspace is in fact the row or column
+      * offset in U where it should go to.
+      */
+      if(myrow == icurrow) {
+        // copy needed rows of A into U
+        HIP::HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, PANEL->dlindxU);
+        // record the evernt when packing completes
+        HIP::event_record(SWAPSTART, UPD);
+      } else {
+        // copy needed rows from A into U(:, iplen[myrow])
+        HIP::HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, Mptr(dU, 0, iplen[myrow], LDU), LDU, PANEL->dlindxU);
+        // record the event when packing completes
+        HIP::event_record(SWAPSTART, UPD);
+      }
   }
+  else if (phase == SWP_COMM) {
+    /* Set MPI message counts and offsets */
+    ipcounts[0] = (iplen[1] - iplen[0]) * LDU;
+    ipoffsets[0] = 0;
+    PANEL->pmat->n = PANEL->dldl1;
+    // if (phase == SWP_END)
+    //   PANEL->pmat->n = PANEL->pmat->dN;
+    for(int i = 1; i < nprow; ++i) {
+      ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU;
+      ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1];
+    }
 
-  MPI_Comm comm  = PANEL->grid->row_comm;
-  int root  = PANEL->pcol;
-
-  if(PANEL->len <= 0) return HPL_SUCCESS;
-  int ierr = MPI_Ibcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm, &bcast_req);
-  *IFLAG = ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
-  return *IFLAG;
-}
+    if(myrow == icurrow) {
+      HIP::event_synchronize(SWAPSTART, UPD);
+      // Send rows info to other ranks
+      HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+      // All gather dU (gather + broadcast)
+      HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+    } else {
+      // Wait for dU to be ready
+      HIP::event_synchronize(SWAPSTART, UPD);
+      // Receive rows from icurrow into dW
+      HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+      // All gather dU
+      HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+    }
+  }
+  else if (phase == SWP_END) {
+    if(myrow == icurrow) {
+        // Swap rows local to A on device
+        HIP::HPL_dlaswp02T(*ipA, n, dA, lda, PANEL->dlindxAU, PANEL->dlindxA);
+    } else {
+        // Queue inserting recieved rows in W into A on device
+        HIP::HPL_dlaswp04T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, PANEL->dlindxU);
+    }
+    /* Permute U in every process row */
+    HIP::HPL_dlaswp10N(n, jb, dU, LDU, PANEL->dpermU);
 
-int HIP::bwait_ibcast(HPL_T_panel* PANEL) {
-  int ierr;
-  ierr = MPI_Wait(&bcast_req, MPI_STATUS_IGNORE);
-  return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
+  }
 }
 
 #define BLOCK_SIZE_PDLANGE 512
@@ -1558,409 +1293,681 @@ __global__ void _dlaswp00N(const int N, const int M, double* __restrict__ A, con
   }
 }
 
-// Row swapping for P==1
-void HIP::HPL_dlaswp00N(const int M, const int N, double* A, const int LDA, const int* IPIV) {
+// Row swapping for P==1
+void HIP::HPL_dlaswp00N(const int M, const int N, double* A, const int LDA, const int* IPIV) {
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
+  int grid_size = N;
+  hipLaunchKernelGGL((_dlaswp00N), dim3(grid_size), dim3(BLOCK_SIZE_00N), 0, stream, N, M, A, LDA, IPIV);
+}
+
+
+#define TILE_DIM_01T 32
+#define BLOCK_ROWS_01T 8
+
+/* Build U matrix from rows of A */
+__global__ void _dlaswp01T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ U, const int LDU, const int* __restrict__ LINDXU) {
+
+  __shared__ double s_U[TILE_DIM_01T][TILE_DIM_01T + 1];
+
+  const int m = threadIdx.x + TILE_DIM_01T * blockIdx.x;
+  const int n = threadIdx.y + TILE_DIM_01T * blockIdx.y;
+
+  if(m < M) {
+    const int ipa  = LINDXU[m];
+
+    // Save to LDS to reduce global memory operation
+    s_U[threadIdx.x][threadIdx.y + 0] =
+        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 8] =
+        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 16] =
+        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 24] =
+        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
+  }
+
+  __syncthreads();
+
+  const int um = threadIdx.y + TILE_DIM_01T * blockIdx.x;
+  const int un = threadIdx.x + TILE_DIM_01T * blockIdx.y;
+
+  if(un < N) {
+    // write out chunks of U
+    if((um + 0) < M)
+      U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
+    if((um + 8) < M)
+      U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
+    if((um + 16) < M)
+      U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
+    if((um + 24) < M)
+      U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
+  }
+}
+
+void HIP::HPL_dlaswp01T(const int M, const int N, double* A, const int LDA, double* U, const int LDU, const int* LINDXU) {
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
+  dim3 grid_size((M + TILE_DIM_01T - 1) / TILE_DIM_01T, (N + TILE_DIM_01T - 1) / TILE_DIM_01T);
+  dim3 block_size(TILE_DIM_01T, BLOCK_ROWS_01T);
+  hipLaunchKernelGGL((_dlaswp01T), grid_size, block_size, 0, stream, M, N, A, LDA, U, LDU, LINDXU);
+}
+
+/* Perform any local row swaps of A */
+__global__ void _dlaswp02T(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ LINDXAU, const int* __restrict__ LINDXA) {
+
+  const int n = blockIdx.x, m = threadIdx.x;
+
+  const int srow = LINDXAU[m]; //src row
+  const int drow  = LINDXA[m];  //dst row
+
+  const double An = A[srow + n * ((size_t)LDA)];
+
+  __syncthreads();
+
+  A[drow + n * ((size_t)LDA)] = An;
+}
+
+void HIP::HPL_dlaswp02T(const int M, const int N, double* A, const int LDA, const int* LINDXAU, const int* LINDXA) {
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
+  dim3 grid_size(N), block_size(M);
+  hipLaunchKernelGGL((_dlaswp02T), N, M, 0, stream, M, N, A, LDA, LINDXAU, LINDXA);
+}
+
+#define TILE_DIM_03T 32
+#define BLOCK_ROWS_03T 8
+
+/* Build W matrix from rows of A */
+__global__ void _dlaswp03T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) {
+
+  __shared__ double s_W[TILE_DIM_03T][TILE_DIM_03T + 1];
+
+  const int m = threadIdx.x + TILE_DIM_03T * blockIdx.x;
+  const int n = threadIdx.y + TILE_DIM_03T * blockIdx.y;
+
+  if(m < M) {
+    const int ipa = LINDXU[m];
+
+    // Save to LDS to reduce global memory operation
+    s_W[threadIdx.x][threadIdx.y + 0] =
+        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
+    s_W[threadIdx.x][threadIdx.y + 8] =
+        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
+    s_W[threadIdx.x][threadIdx.y + 16] =
+        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
+    s_W[threadIdx.x][threadIdx.y + 24] =
+        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
+  }
+
+  __syncthreads();
+
+  const int wm = threadIdx.y + TILE_DIM_03T * blockIdx.x;
+  const int wn = threadIdx.x + TILE_DIM_03T * blockIdx.y;
+
+  if(wn < N) {
+    // write out chunks of W
+    if((wm + 0) < M)
+      W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x];
+    if((wm + 8) < M)
+      W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x];
+    if((wm + 16) < M)
+      W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x];
+    if((wm + 24) < M)
+      W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x];
+  }
+}
+
+void HIP::HPL_dlaswp03T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) {
 
   if((M <= 0) || (N <= 0)) return;
-
   hipStream_t stream;
   ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
-  int grid_size = N;
-  hipLaunchKernelGGL((_dlaswp00N), dim3(grid_size), dim3(BLOCK_SIZE_00N), 0, stream, N, M, A, LDA, IPIV);
+  dim3 grid_size((M + TILE_DIM_03T - 1) / TILE_DIM_03T, (N + TILE_DIM_03T - 1) / TILE_DIM_03T);
+  dim3 block_size(TILE_DIM_03T, BLOCK_ROWS_03T);
+  hipLaunchKernelGGL((_dlaswp03T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU);
 }
 
+#define TILE_DIM_04T 32
+#define BLOCK_ROWS_04T 8
 
-#define TILE_DIM_01T 32
-#define BLOCK_ROWS_01T 8
-
-/* Build U matrix from rows of A */
-__global__ void _dlaswp01T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ U, const int LDU, const int* __restrict__ LINDXU) {
+static __global__ void _dlaswp04T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) {
 
-  __shared__ double s_U[TILE_DIM_01T][TILE_DIM_01T + 1];
+  __shared__ double s_W[TILE_DIM_04T][TILE_DIM_04T + 1];
 
-  const int m = threadIdx.x + TILE_DIM_01T * blockIdx.x;
-  const int n = threadIdx.y + TILE_DIM_01T * blockIdx.y;
+  const int am = threadIdx.x + TILE_DIM_04T * blockIdx.x;
+  const int an = threadIdx.y + TILE_DIM_04T * blockIdx.y;
 
-  if(m < M) {
-    const int ipa  = LINDXU[m];
+  const int wm = threadIdx.y + TILE_DIM_04T * blockIdx.x;
+  const int wn = threadIdx.x + TILE_DIM_04T * blockIdx.y;
 
-    // Save to LDS to reduce global memory operation
-    s_U[threadIdx.x][threadIdx.y + 0] =
-        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
-    s_U[threadIdx.x][threadIdx.y + 8] =
-        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
-    s_U[threadIdx.x][threadIdx.y + 16] =
-        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
-    s_U[threadIdx.x][threadIdx.y + 24] =
-        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
+  if(wn < N) {
+    s_W[threadIdx.y + 0][threadIdx.x] =
+        (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0;
+    s_W[threadIdx.y + 8][threadIdx.x] =
+        (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0;
+    s_W[threadIdx.y + 16][threadIdx.x] =
+        (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0;
+    s_W[threadIdx.y + 24][threadIdx.x] =
+        (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0;
   }
 
   __syncthreads();
 
-  const int um = threadIdx.y + TILE_DIM_01T * blockIdx.x;
-  const int un = threadIdx.x + TILE_DIM_01T * blockIdx.y;
-
-  if(un < N) {
-    // write out chunks of U
-    if((um + 0) < M)
-      U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
-    if((um + 8) < M)
-      U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
-    if((um + 16) < M)
-      U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
-    if((um + 24) < M)
-      U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
+  if(am < M) {
+    const int aip = LINDXU[am];
+    if((an + 0) < N)
+      A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0];
+    if((an + 8) < N)
+      A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8];
+    if((an + 16) < N)
+      A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16];
+    if((an + 24) < N)
+      A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24];
   }
 }
 
-void HIP::HPL_dlaswp01T(const int M, const int N, double* A, const int LDA, double* U, const int LDU, const int* LINDXU) {
+void HIP::HPL_dlaswp04T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) {
+  if((M <= 0) || (N <= 0)) return;
+  hipStream_t stream;
+  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); 
+  dim3 grid_size((M + TILE_DIM_04T - 1) / TILE_DIM_04T, (N + TILE_DIM_04T - 1) / TILE_DIM_04T);
+  dim3 block_size(TILE_DIM_04T, BLOCK_ROWS_04T);
+  hipLaunchKernelGGL((_dlaswp04T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU);
+}
+
+__global__ void _dlaswp10N(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ IPIV) {
+
+  const int m = threadIdx.x + blockDim.x * blockIdx.x;
+
+  if (m < M) { 
+    for (int i = 0; i < N; i++) {
+      const int ip = IPIV[i];
+      if (ip != i) {
+        // swap rows
+        const double Ai = A[m + i * ((size_t)LDA)];
+        const double Aip = A[m + ip * ((size_t)LDA)];
+        A[m + i * ((size_t)LDA)] = Aip;
+        A[m + ip * ((size_t)LDA)] = Ai;
+      }
+    }
+  }
+}
 
+void HIP::HPL_dlaswp10N(const int M, const int N, double* A, const int LDA, const int* IPIV) {
   if((M <= 0) || (N <= 0)) return;
 
   hipStream_t stream;
   ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
-  dim3 grid_size((M + TILE_DIM_01T - 1) / TILE_DIM_01T, (N + TILE_DIM_01T - 1) / TILE_DIM_01T);
-  dim3 block_size(TILE_DIM_01T, BLOCK_ROWS_01T);
-  hipLaunchKernelGGL((_dlaswp01T), grid_size, block_size, 0, stream, M, N, A, LDA, U, LDU, LINDXU);
+
+  const int block_size_10N = 512;
+
+  dim3 grid_size((M + block_size_10N - 1) / block_size_10N);
+  hipLaunchKernelGGL((_dlaswp10N), grid_size, dim3(block_size_10N), 0, stream, M, N, A, LDA, IPIV);
 }
 
-/* Perform any local row swaps of A */
-__global__ void _dlaswp02T(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ LINDXAU, const int* __restrict__ LINDXA) {
+__global__ void setZero(const int N, double* __restrict__ X) {
+  const int t = threadIdx.x, b = blockIdx.x;
+  const size_t id = b * blockDim.x + t; // row id
 
-  const int n = blockIdx.x, m = threadIdx.x;
+  if(id < N)
+    X[id] = 0.0;
+}
 
-  const int srow = LINDXAU[m]; //src row
-  const int drow  = LINDXA[m];  //dst row
+void HIP::HPL_set_zero(const int N, double* __restrict__ X) {
+    const int block_size = 512;
+    hipLaunchKernelGGL((setZero), dim3((N + block_size - 1) / block_size), dim3(block_size), 0, HIP::computeStream, N, X);
+}
 
-  const double An = A[srow + n * ((size_t)LDA)];
 
-  __syncthreads();
 
-  A[drow + n * ((size_t)LDA)] = An;
+// Setting the matrix section and phase of pdupdate
+void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, int icurcol, std::list<PDLASWP_OP> op_list) {
+  HPL_T_UPD UPD;
+  SWP_PHASE phase;
+  for (auto it = op_list.begin(); it != op_list.end(); ++it) {
+    const PDLASWP_OP op = *it;
+    if (op == SU0 || op == SU1 || op == SU2)  phase = SWP_START;
+    else if (op == CU0 || op == CU1 || op == CU2) phase = SWP_COMM;
+    else if (op == EU0 || op == EU1 || op == EU2) phase = SWP_END;
+    else  phase = SWP_NO;
+    
+    if (op == SU0 || op == CU0 || op == EU0)  UPD = HPL_LOOK_AHEAD;
+    else if (op == SU1 || op == CU1 || op == EU1) UPD = HPL_UPD_1;
+    else if (op == SU2 || op == CU2 || op == EU2) UPD = HPL_UPD_2;
+    else UPD = HPL_N_UPD;
+
+    if (UPD == HPL_LOOK_AHEAD && PANEL->grid->mycol != icurcol)
+      continue;
+    else
+      HPL_pdlaswp_hip(PANEL, UPD, phase);
+  }
 }
 
-void HIP::HPL_dlaswp02T(const int M, const int N, double* A, const int LDA, const int* LINDXAU, const int* LINDXA) {
+void HIP::pdlaswp_set_var(HPL_T_panel* PANEL, double* &dU, double* &U, int &ldu, double* &dW, double* &W, int &ldw, int &n, double* &dA, const HPL_T_UPD UPD) {
+  switch (UPD) {
+  case HPL_LOOK_AHEAD:
+    dU = PANEL->dU; U = PANEL->U; ldu = PANEL->ldu0;
+    dW = PANEL->dW; W = PANEL->W; ldw = PANEL->ldu0;
+    n = PANEL->nu0;
+    break;
+  case HPL_UPD_1:
+    dU = PANEL->dU1;  U = PANEL->U1;  ldu = PANEL->ldu1;
+    dW = PANEL->dW1;  W = PANEL->W1;  ldw = PANEL->ldu1;
+    n = PANEL->nu1;
+    dA = Mptr(dA, 0, PANEL->nu0, PANEL->dlda);
+    break;
+  case HPL_UPD_2:
+    dU = PANEL->dU2;  U = PANEL->U2;  ldu = PANEL->ldu2;
+    dW = PANEL->dW2;  W = PANEL->W2;  ldw = PANEL->ldu2;
+    n = PANEL->nu2;
+    dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, PANEL->dlda);
+    break;
+  default:
+    break;
+  }
+}
 
-  if((M <= 0) || (N <= 0)) return;
+void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N, const int JB,
+                     HPL_T_pmat *A, const int IA, const int JA, const int TAG, HPL_T_panel *PANEL)
+{
+  size_t dalign;
+  int icurcol, icurrow, ii, itmp1, jj, lwork,
+      ml2, mp, mycol, myrow, nb, npcol, nprow,
+      nq, nu, ldu;
+  /* ..
+   * .. Executable Statements ..
+   */
+  PANEL->grid = GRID; /* ptr to the process grid */
+  PANEL->algo = ALGO; /* ptr to the algo parameters */
+  PANEL->pmat = A;    /* ptr to the local array info */
 
-  hipStream_t stream;
-  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
-  dim3 grid_size(N), block_size(M);
-  hipLaunchKernelGGL((_dlaswp02T), N, M, 0, stream, M, N, A, LDA, LINDXAU, LINDXA);
-}
+  myrow = GRID->myrow;
+  mycol = GRID->mycol;
+  nprow = GRID->nprow;
+  npcol = GRID->npcol;
+  nb = A->nb;
+
+  HPL_infog2l(IA, JA, nb, nb, nb, nb, 0, 0, myrow, mycol,
+              nprow, npcol, &ii, &jj, &icurrow, &icurcol);
+  mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow);
+  nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol);
+
+  const int inxtcol = MModAdd1(icurcol, npcol);
+  const int inxtrow = MModAdd1(icurrow, nprow);
+
+  /* ptr to trailing part of A */
+  PANEL->A = A->A;
+  PANEL->dA = Mptr((double *)(A->d_A), ii, jj, A->ld);
+
+  /*
+   * Workspace pointers are initialized to NULL.
+   */
+  PANEL->L2 = nullptr;
+  PANEL->dL2 = nullptr;
+  PANEL->L1 = nullptr;
+  PANEL->dL1 = nullptr;
+  PANEL->DINFO = nullptr;
+  PANEL->U = nullptr;
+  PANEL->dU = nullptr;
+  PANEL->W = nullptr;
+  PANEL->dW = nullptr;
+  PANEL->U1 = nullptr;
+  PANEL->dU1 = nullptr;
+  PANEL->W1 = nullptr;
+  PANEL->dW1 = nullptr;
+  PANEL->U2 = nullptr;
+  PANEL->dU2 = nullptr;
+  PANEL->W2 = nullptr;
+  PANEL->dW2 = nullptr;
+  /*
+   * Local lengths, indexes process coordinates
+   */
+  PANEL->nb = nb;           /* distribution blocking factor */
+  PANEL->jb = JB;           /* panel width */
+  PANEL->m = M;             /* global # of rows of trailing part of A */
+  PANEL->n = N;             /* global # of cols of trailing part of A */
+  PANEL->ia = IA;           /* global row index of trailing part of A */
+  PANEL->ja = JA;           /* global col index of trailing part of A */
+  PANEL->mp = mp;           /* local # of rows of trailing part of A */
+  PANEL->nq = nq;           /* local # of cols of trailing part of A */
+  PANEL->ii = ii;           /* local row index of trailing part of A */
+  PANEL->jj = jj;           /* local col index of trailing part of A */
+  PANEL->lda = Mmax(1, mp); /* local leading dim of array A */
+  PANEL->dlda = A->ld;      /* local leading dim of array A */
+  PANEL->prow = icurrow;    /* proc row owning 1st row of trailing A */
+  PANEL->pcol = icurcol;    /* proc col owning 1st col of trailing A */
+  PANEL->msgid = TAG;       /* message id to be used for panel bcast */
+  /*
+   * Initialize  ldl2 and len to temporary dummy values and Update tag for
+   * next panel
+   */
+  PANEL->ldl2 = 0;  /* local leading dim of array L2 */
+  PANEL->dldl2 = 0; /* local leading dim of array L2 */
+  PANEL->dldl1 = 1.02 * A->dN; // padding
+  PANEL->len = 0;   /* length of the buffer to broadcast */
+  PANEL->nu0 = 0;
+  PANEL->nu1 = 0;
+  PANEL->nu2 = 0;
+  PANEL->ldu0 = 0;
+  PANEL->ldu1 = 0;
+  PANEL->ldu2 = 0;
 
-#define TILE_DIM_03T 32
-#define BLOCK_ROWS_03T 8
+  
+  /*Split fraction*/
+  const double fraction = 0.6;
 
-/* Build W matrix from rows of A */
-__global__ void _dlaswp03T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) {
+  if ((double)M / A->dN > 0.97) {
+    HPL_ptimer_boot();
+    HPL_ptimer( 0 );
+  }
+  dalign = ALGO->align * sizeof(double);
+  size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
 
-  __shared__ double s_W[TILE_DIM_03T][TILE_DIM_03T + 1];
+  if (npcol == 1) /* P x 1 process grid */
+  {               /* space for L1, DPIV, DINFO */
+    lwork = ALGO->align + (PANEL->len = JB * JB + lpiv) + 1;
+    nu = Mmax(0, nq - JB);
+    ldu = nu + 256; /*extra space for padding*/
+    lwork += JB * ldu;
 
-  const int m = threadIdx.x + TILE_DIM_03T * blockIdx.x;
-  const int n = threadIdx.y + TILE_DIM_03T * blockIdx.y;
+    if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double))
+    {
+      if (PANEL->WORK)
+      {
+        HIP_CHECK_ERROR(hipFree(PANEL->dWORK));
+        HIP_CHECK_ERROR(hipHostFree(PANEL->WORK));
+      }
+      size_t numbytes = (size_t)(lwork) * sizeof(double);
 
-  if(m < M) {
-    const int ipa = LINDXU[m];
+      if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS ||
+          hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS)
+      {
+        HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed");
+      }
+      PANEL->max_work_size = (size_t)(lwork) * sizeof(double);
 
-    // Save to LDS to reduce global memory operation
-    s_W[threadIdx.x][threadIdx.y + 0] =
-        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
-    s_W[threadIdx.x][threadIdx.y + 8] =
-        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
-    s_W[threadIdx.x][threadIdx.y + 16] =
-        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
-    s_W[threadIdx.x][threadIdx.y + 24] =
-        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
-  }
+#ifdef HPL_VERBOSE_PRINT
+      if ((myrow == 0) && (mycol == 0))
+      {
+        printf("Allocating %g GBs of storage on CPU...",
+               ((double)numbytes) / (1024 * 1024 * 1024));
+        fflush(stdout);
 
-  __syncthreads();
+        printf("done.\n");
+        printf("Allocating %g GBs of storage on GPU...",
+               ((double)numbytes) / (1024 * 1024 * 1024));
+        fflush(stdout);
+        printf("done.\n");
+      }
+#endif
+    }
+    /*
+     * Initialize the pointers of the panel structure  -  Always re-use A in
+     * the only process column
+     */
+    PANEL->ldl2 = Mmax(1, mp);
+    PANEL->dldl2 = A->ld;
+    PANEL->dL2 = PANEL->dA + (myrow == icurrow ? JB : 0);
+    PANEL->L2 = PANEL->A + (myrow == icurrow ? JB : 0);
+    PANEL->U = (double *)PANEL->WORK;
+    PANEL->dU = (double *)PANEL->dWORK;
+    PANEL->L1 = (double *)PANEL->WORK + (JB * Mmax(0, ldu));
+    PANEL->dL1 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu));
+    PANEL->W = A->W;
+    PANEL->dW = A->dW;
 
-  const int wm = threadIdx.y + TILE_DIM_03T * blockIdx.x;
-  const int wn = threadIdx.x + TILE_DIM_03T * blockIdx.y;
+    if (nprow == 1)
+    {
+      PANEL->nu0 = Mmin(JB, nu);
+      PANEL->ldu0 = PANEL->nu0;
 
-  if(wn < N) {
-    // write out chunks of W
-    if((wm + 0) < M)
-      W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x];
-    if((wm + 8) < M)
-      W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x];
-    if((wm + 16) < M)
-      W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x];
-    if((wm + 24) < M)
-      W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x];
-  }
-}
+      PANEL->nu1 = 0;
+      PANEL->ldu1 = 0;
 
-void HIP::HPL_dlaswp03T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) {
+      PANEL->nu2 = nu - PANEL->nu0;
+      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
 
-  if((M <= 0) || (N <= 0)) return;
-  hipStream_t stream;
-  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
-  dim3 grid_size((M + TILE_DIM_03T - 1) / TILE_DIM_03T, (N + TILE_DIM_03T - 1) / TILE_DIM_03T);
-  dim3 block_size(TILE_DIM_03T, BLOCK_ROWS_03T);
-  hipLaunchKernelGGL((_dlaswp03T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU);
-}
+      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
+      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
+      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
+      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
 
-#define TILE_DIM_04T 32
-#define BLOCK_ROWS_04T 8
+      PANEL->permU = (int *)(PANEL->L1 + JB * JB);
+      PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB);
+      PANEL->ipiv = PANEL->permU + JB;
+      PANEL->dipiv = PANEL->dpermU + JB;
 
-static __global__ void _dlaswp04T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) {
+      PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB);
+      PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB);
+    }
+    else
+    {
+      const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb));
+      PANEL->nu0 = Mmin(JB, nu);
+      PANEL->ldu0 = PANEL->nu0;
 
-  __shared__ double s_W[TILE_DIM_04T][TILE_DIM_04T + 1];
+      PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit);
+      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
 
-  const int am = threadIdx.x + TILE_DIM_04T * blockIdx.x;
-  const int an = threadIdx.y + TILE_DIM_04T * blockIdx.y;
+      PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2;
+      PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/
 
-  const int wm = threadIdx.y + TILE_DIM_04T * blockIdx.x;
-  const int wn = threadIdx.x + TILE_DIM_04T * blockIdx.y;
+      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
+      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
+      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
+      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
 
-  if(wn < N) {
-    s_W[threadIdx.y + 0][threadIdx.x] =
-        (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0;
-    s_W[threadIdx.y + 8][threadIdx.x] =
-        (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0;
-    s_W[threadIdx.y + 16][threadIdx.x] =
-        (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0;
-    s_W[threadIdx.y + 24][threadIdx.x] =
-        (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0;
-  }
+      PANEL->W1 = PANEL->W + PANEL->ldu0 * JB;
+      PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB;
+      PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB;
+      PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB;
 
-  __syncthreads();
+      PANEL->lindxA = (int *)(PANEL->L1 + JB * JB);
+      PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB);
+      PANEL->lindxAU = PANEL->lindxA + JB;
+      PANEL->dlindxAU = PANEL->dlindxA + JB;
+      PANEL->lindxU = PANEL->lindxAU + JB;
+      PANEL->dlindxU = PANEL->dlindxAU + JB;
+      PANEL->permU = PANEL->lindxU + JB;
+      PANEL->dpermU = PANEL->dlindxU + JB;
 
-  if(am < M) {
-    const int aip = LINDXU[am];
-    if((an + 0) < N)
-      A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0];
-    if((an + 8) < N)
-      A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8];
-    if((an + 16) < N)
-      A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16];
-    if((an + 24) < N)
-      A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24];
+      // Put ipiv array at the end
+      PANEL->dipiv = PANEL->dpermU + JB;
+      PANEL->ipiv = PANEL->permU + JB;
+
+      PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv;
+      PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv;
+    }
+
+    *(PANEL->DINFO) = 0.0;
   }
-}
+  else  // for ncol != 1
+  { /* space for L2, L1, DPIV */
+    ml2 = (myrow == icurrow ? mp - JB : mp);
+    ml2 = Mmax(0, ml2);
+    ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/
+    itmp1 = JB * JB + lpiv;              // L1, integer arrays
+    PANEL->len = ml2 * JB + itmp1;
 
-void HIP::HPL_dlaswp04T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) {
-  if((M <= 0) || (N <= 0)) return;
-  hipStream_t stream;
-  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); 
-  dim3 grid_size((M + TILE_DIM_04T - 1) / TILE_DIM_04T, (N + TILE_DIM_04T - 1) / TILE_DIM_04T);
-  dim3 block_size(TILE_DIM_04T, BLOCK_ROWS_04T);
-  hipLaunchKernelGGL((_dlaswp04T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU);
-}
+    lwork = ALGO->align + PANEL->len + 1;
 
-__global__ void _dlaswp10N(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ IPIV) {
+    nu = Mmax(0, (mycol == icurcol ? nq - JB : nq));
+    ldu = nu + 256; /*extra space for potential padding*/
 
-  const int m = threadIdx.x + blockDim.x * blockIdx.x;
+    // if( nprow > 1 )                                 /* space for U */
+    {
+      lwork += JB * ldu;
+    }
+    if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double))
+    {
+      if (PANEL->WORK)
+      {
+        HIP_CHECK_ERROR(hipFree(PANEL->dWORK));
+        HIP_CHECK_ERROR(hipHostFree(PANEL->WORK));
+      }
+      size_t numbytes = (size_t)(lwork) * sizeof(double);
 
-  if (m < M) { 
-    for (int i = 0; i < N; i++) {
-      const int ip = IPIV[i];
-      if (ip != i) {
-        // swap rows
-        const double Ai = A[m + i * ((size_t)LDA)];
-        const double Aip = A[m + ip * ((size_t)LDA)];
-        A[m + i * ((size_t)LDA)] = Aip;
-        A[m + ip * ((size_t)LDA)] = Ai;
+      if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS ||
+          hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS)
+      {
+        HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed");
+      }
+      PANEL->max_work_size = (size_t)(lwork) * sizeof(double);
+#ifdef HPL_VERBOSE_PRINT
+      if ((myrow == 0) && (mycol == 0))
+      {
+        printf("Allocating %g GBs of storage on CPU...",
+               ((double)numbytes) / (1024 * 1024 * 1024));
+        fflush(stdout);
+        printf("done.\n");
+        printf("Allocating %g GBs of storage on GPU...",
+               ((double)numbytes) / (1024 * 1024 * 1024));
+        fflush(stdout);
+        printf("done.\n");
       }
+#endif
     }
-  }
-}
-
-void HIP::HPL_dlaswp10N(const int M, const int N, double* A, const int LDA, const int* IPIV) {
-  if((M <= 0) || (N <= 0)) return;
-
-  hipStream_t stream;
-  ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream));
+    /*
+     * Initialize the pointers of the panel structure - Re-use A in the cur-
+     * rent process column when HPL_COPY_L is not defined.
+     */
+    PANEL->U = (double *)PANEL->WORK;
+    PANEL->dU = (double *)PANEL->dWORK;
 
-  const int block_size_10N = 512;
+    PANEL->W = A->W;
+    PANEL->dW = A->dW;
 
-  dim3 grid_size((M + block_size_10N - 1) / block_size_10N);
-  hipLaunchKernelGGL((_dlaswp10N), grid_size, dim3(block_size_10N), 0, stream, M, N, A, LDA, IPIV);
-}
+    PANEL->L2 = (double *)PANEL->WORK + (JB * Mmax(0, ldu));
+    PANEL->dL2 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu));
+    PANEL->L1 = PANEL->L2 + ml2 * JB;
+    PANEL->dL1 = PANEL->dL2 + ml2 * JB;
+    PANEL->ldl2 = Mmax(1, ml2);
+    PANEL->dldl2 = Mmax(1, ml2);
 
-__global__ void setZero(const int N, double* __restrict__ X) {
-  const int t = threadIdx.x, b = blockIdx.x;
-  const size_t id = b * blockDim.x + t; // row id
+    if (nprow == 1)
+    {
+      PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0;
+      PANEL->ldu0 = PANEL->nu0;
 
-  if(id < N)
-    X[id] = 0.0;
-}
+      PANEL->nu1 = 0;
+      PANEL->ldu1 = 0;
 
-void HIP::HPL_set_zero(const int N, double* __restrict__ X) {
-    const int block_size = 512;
-    hipLaunchKernelGGL((setZero), dim3((N + block_size - 1) / block_size), dim3(block_size), 0, HIP::computeStream, N, X);
-}
+      PANEL->nu2 = nu - PANEL->nu0;
+      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
 
-void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, const HPL_T_UPD UPD, const SWP_PHASE phase) {
-  double *U, *W;
-  double *dA, *dU, *dW;
-  int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU;
-  int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex;
-  int  icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; 
-  MPI_Comm comm;
+      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
+      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
+      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
+      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
 
-  /*
-   * Retrieve parameters from the PANEL data structure
-   */
-  n = PANEL->n; jb = PANEL->jb;
-  nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow;
-  comm = PANEL->grid->col_comm; icurrow = PANEL->prow;
-  iflag = PANEL->IWORK;
-  dA = PANEL->dA; lda = PANEL->dlda;
+      PANEL->permU = (int *)(PANEL->L1 + JB * JB);
+      PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB);
+      PANEL->ipiv = PANEL->permU + JB;
+      PANEL->dipiv = PANEL->dpermU + JB;
 
-  // Quick return if we're 1xQ
-  if(phase != SWP_END && nprow == 1) return;
+      PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB);
+      PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB);
+    }
+    else
+    {
+      const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb));
+      PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0;
+      PANEL->ldu0 = PANEL->nu0;
 
-  pdlaswp_set_var(PANEL, dU, U, LDU, dW, W, LDW, n, dA, UPD);
+      PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit);
+      PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/
 
-  /* Quick return if there is nothing to do */
-  if((n <= 0) || (jb <= 0)) return;
+      PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2;
+      PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/
 
-  // Quick swapping if P==1
-  if (phase == SWP_END && nprow == 1) {
-    // wait for swapping data to arrive
-    HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, SWAPDATATRANSFER, HPL_TR);
+      PANEL->U1 = PANEL->U + PANEL->ldu0 * JB;
+      PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB;
+      PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB;
+      PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB;
 
-    HIP::HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv);
-    return;
-  }
+      PANEL->W1 = PANEL->W + PANEL->ldu0 * JB;
+      PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB;
+      PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB;
+      PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB;
 
-  /*
-   * Compute ipID (if not already done for this panel). lindxA and lindxAU
-   * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
-   * are of size nprow,  permU is of length jb, and  this function needs a
-   * workspace of size max( 2 * jb (plindx1), nprow+1(equil)):
-   * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1)
-   * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1);
-   */
-  k = (int)((unsigned int)(jb) << 1);
-  ipl = iflag + 1;
-  ipID = ipl + 1;
-  ipA = ipID + ((unsigned int)(k) << 1);
-  iplen = ipA + 1;
-  ipcounts = iplen + nprow + 1;
-  ipoffsets = ipcounts + nprow;
-  iwork = ipoffsets + nprow;
+      PANEL->lindxA = (int *)(PANEL->L1 + JB * JB);
+      PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB);
+      PANEL->lindxAU = PANEL->lindxA + JB;
+      PANEL->dlindxAU = PANEL->dlindxA + JB;
+      PANEL->lindxU = PANEL->lindxAU + JB;
+      PANEL->dlindxU = PANEL->dlindxAU + JB;
+      PANEL->permU = PANEL->lindxU + JB;
+      PANEL->dpermU = PANEL->dlindxU + JB;
 
-  if (phase == SWP_START) {
-    if(*iflag == -1) {/* no index arrays have been computed so far */
-        // get the ipivs on the host after the Bcast
-        if(PANEL->grid->mycol != PANEL->pcol) {
-          HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->ipiv, PANEL->jb * sizeof(int),
-                          PANEL->dipiv, PANEL->jb * sizeof(int),
-                          PANEL->jb * sizeof(int), 1,
-                          hipMemcpyDeviceToHost, HIP::dataStream));
-        }
-        HPL_BE_stream_synchronize(HPL_DATASTREAM, HPL_TR);
+      // Put ipiv array at the end
+      PANEL->ipiv = PANEL->permU + JB;
+      PANEL->dipiv = PANEL->dpermU + JB;
 
-        // compute spreading info
-        HPL_pipid(PANEL, ipl, ipID);
-        HPL_plindx(PANEL, *ipl, ipID, ipA, PANEL->lindxU, PANEL->lindxAU, PANEL->lindxA, iplen, PANEL->permU, iwork);
-        *iflag = 1;
+      PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv;
+      PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv;
     }
 
-      /*
-      * For i in [0..2*jb),  lindxA[i] is the offset in A of a row that ulti-
-      * mately goes to U( :, lindxAU[i] ).  In each rank, we directly pack
-      * into U, otherwise we pack into workspace. The  first
-      * entry of each column packed in workspace is in fact the row or column
-      * offset in U where it should go to.
-      */
-      if(myrow == icurrow) {
-        // copy needed rows of A into U
-        HIP::HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, PANEL->dlindxU);
-        // record the evernt when packing completes
-        HIP::event_record(SWAPSTART, UPD);
-      } else {
-        // copy needed rows from A into U(:, iplen[myrow])
-        HIP::HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, Mptr(dU, 0, iplen[myrow], LDU), LDU, PANEL->dlindxU);
-        // record the event when packing completes
-        HIP::event_record(SWAPSTART, UPD);
-      }
+    *(PANEL->DINFO) = 0.0;
   }
-  else if (phase == SWP_COMM) {
-    /* Set MPI message counts and offsets */
-    ipcounts[0] = (iplen[1] - iplen[0]) * LDU;
-    ipoffsets[0] = 0;
 
-    for(int i = 1; i < nprow; ++i) {
-      ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU;
-      ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1];
-    }
+  if (nprow == 1)
+  {
+    lwork = mp + JB;
+  }
+  else
+  {
+    itmp1 = (JB << 1);
+    lwork = nprow + 1;
+    itmp1 = Mmax(itmp1, lwork);
+    lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1;
+  }
 
-    if(myrow == icurrow) {
-      HIP::event_synchronize(SWAPSTART, UPD);
-      // Send rows info to other ranks
-      HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
-      // All gather dU (gather + broadcast)
-      HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
-    } else {
-      // Wait for dU to be ready
-      HIP::event_synchronize(SWAPSTART, UPD);
-      // Receive rows from icurrow into dW
-      HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
-      // All gather dU
-      HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+  if (PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int))
+  {
+    if (PANEL->IWORK)
+    {
+      std::free(PANEL->IWORK);
     }
-  }
-  else if (phase == SWP_END) {
-    if(myrow == icurrow) {
-        // Swap rows local to A on device
-        HIP::HPL_dlaswp02T(*ipA, n, dA, lda, PANEL->dlindxAU, PANEL->dlindxA);
-    } else {
-        // Queue inserting recieved rows in W into A on device
-        HIP::HPL_dlaswp04T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, PANEL->dlindxU);
+    size_t numbytes = (size_t)(lwork) * sizeof(int);
+    PANEL->IWORK = (int *)std::malloc(numbytes);
+    if (PANEL->IWORK == NULL)
+    {
+      HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host Integer Memory allocation failed");
     }
-    /* Permute U in every process row */
-    HIP::HPL_dlaswp10N(n, jb, dU, LDU, PANEL->dpermU);
+    PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int);
   }
-}
+  if (lwork)
+    *(PANEL->IWORK) = -1;
 
-// Setting the matrix section and phase of pdupdate
-void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, int icurcol, std::list<PDLASWP_OP> op_list) {
-  HPL_T_UPD UPD;
-  SWP_PHASE phase;
-  for (auto it = op_list.begin(); it != op_list.end(); ++it) {
-    const PDLASWP_OP op = *it;
-    if (op == SU0 || op == SU1 || op == SU2)  phase = SWP_START;
-    else if (op == CU0 || op == CU1 || op == CU2) phase = SWP_COMM;
-    else if (op == EU0 || op == EU1 || op == EU2) phase = SWP_END;
-    else  phase = SWP_NO;
-    
-    if (op == SU0 || op == CU0 || op == EU0)  UPD = HPL_LOOK_AHEAD;
-    else if (op == SU1 || op == CU1 || op == EU1) UPD = HPL_UPD_1;
-    else if (op == SU2 || op == CU2 || op == EU2) UPD = HPL_UPD_2;
-    else UPD = HPL_N_UPD;
+  /* ensure the temp buffer in HPL_pdfact is allocated once*/
+  lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1));
+  if (PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double))
+  {
+    if (PANEL->fWORK)
+    {
+      HIP_CHECK_ERROR(hipHostFree(PANEL->fWORK));
+    }
+    size_t numbytes = (size_t)(lwork) * sizeof(double);
 
-    if (UPD == HPL_LOOK_AHEAD && PANEL->grid->mycol != icurcol)
-      continue;
-    else
-      HPL_pdlaswp_hip(PANEL, UPD, phase);
+    HIP_CHECK_ERROR(hipHostMalloc((void **)&PANEL->fWORK, numbytes));
+    if (PANEL->fWORK == NULL)
+    {
+      HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host pdfact Scratch Memory allocation failed");
+    }
+    PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double);
   }
+  /*
+   * End of HPL_pdpanel_init
+   */
 }
 
-void HIP::pdlaswp_set_var(HPL_T_panel* PANEL, double* &dU, double* &U, int &ldu, double* &dW, double* &W, int &ldw, int &n, double* &dA, const HPL_T_UPD UPD) {
-  switch (UPD) {
-  case HPL_LOOK_AHEAD:
-    dU = PANEL->dU; U = PANEL->U; ldu = PANEL->ldu0;
-    dW = PANEL->dW; W = PANEL->W; ldw = PANEL->ldu0;
-    n = PANEL->nu0;
-    break;
-  case HPL_UPD_1:
-    dU = PANEL->dU1;  U = PANEL->U1;  ldu = PANEL->ldu1;
-    dW = PANEL->dW1;  W = PANEL->W1;  ldw = PANEL->ldu1;
-    n = PANEL->nu1;
-    dA = Mptr(dA, 0, PANEL->nu0, PANEL->dlda);
-    break;
-  case HPL_UPD_2:
-    dU = PANEL->dU2;  U = PANEL->U2;  ldu = PANEL->ldu2;
-    dW = PANEL->dW2;  W = PANEL->W2;  ldw = PANEL->ldu2;
-    n = PANEL->nu2;
-    dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, PANEL->dlda);
-    break;
-  default:
-    break;
-  }
-}
\ No newline at end of file
diff --git a/testing/backend/HPL_backendWrapper.cpp b/testing/backend/HPL_backendWrapper.cpp
index 2d51ef3..0524317 100644
--- a/testing/backend/HPL_backendWrapper.cpp
+++ b/testing/backend/HPL_backendWrapper.cpp
@@ -6,14 +6,14 @@ extern "C" {
    /*
    * Initialize the Target device
    */
-   void HPL_BE_init(size_t num_devices, enum HPL_TARGET TR)
+   void HPL_BE_init(const HPL_T_grid* GRID, enum HPL_TARGET TR)
    {
       switch(TR) {
          case T_CPU:
             DO_NOTHING();
             break;
          case T_HIP:
-            HPL::dispatch(HIP::init, num_devices);
+            HPL::dispatch(HIP::init, GRID);
             break;
          default:
             DO_NOTHING();
diff --git a/testing/ptest/HPL_pddriver.cpp b/testing/ptest/HPL_pddriver.cpp
index 0b428c6..f942e8e 100644
--- a/testing/ptest/HPL_pddriver.cpp
+++ b/testing/ptest/HPL_pddriver.cpp
@@ -99,7 +99,7 @@ int main( ARGC, ARGV )
                               inbm, indh, indv, ipfa, ipq, irfa, itop,
                               mycol, myrow, ns, nbs, nbms, ndhs, ndvs,
                               npcol, npfs, npqs, nprow, nrfs, ntps, 
-                              rank, size, tswap;
+                              rank, size, tswap, p, q;
    HPL_T_ORDER                pmapping;
    HPL_T_FACT                 rpfa;
    HPL_T_SWAP                 fswap;
@@ -147,16 +147,9 @@ int main( ARGC, ARGV )
  * 1            Equilibration (0=no,1=yes)
  * 8            memory alignment in double (> 0)
  */
-   HPL_pdinfo( &test, &ns, nval, &nbs, nbval, &pmapping, &npqs, pval, qval,
-               &npfs, pfaval, &nbms, nbmval, &ndvs, ndvval, &nrfs, rfaval,
-               &ntps, topval, &ndhs, ndhval, &fswap, &tswap, &L1notran,
-               &Unotran, &equil, &align );
-
-   /*
-   * Init Target Device
-   */
-   HPL_BE_init(1, HPL_TR);
-
+   HPL_pdinfo( ARGC, ARGV, &test, &ns, nval, &nbs, nbval, &pmapping, &npqs, 
+               pval, qval, &p, &q, &npfs, pfaval, &nbms, nbmval, &ndvs, ndvval, &nrfs, rfaval, &ntps, topval, &ndhs, ndhval, &fswap, 
+               &tswap, &L1notran, &Unotran, &equil, &align );
 /*
  * Loop over different process grids - Define process grid. Go to bottom
  * of process grid loop if this case does not use my process.
@@ -164,11 +157,13 @@ int main( ARGC, ARGV )
    for( ipq = 0; ipq < npqs; ipq++ )
    {
       (void) HPL_grid_init( MPI_COMM_WORLD, pmapping, pval[ipq], qval[ipq],
-                            &grid );
+                            p, q, &grid );
       (void) HPL_grid_info( &grid, &nprow, &npcol, &myrow, &mycol );
 
       if( ( myrow < 0 ) || ( myrow >= nprow ) ||
           ( mycol < 0 ) || ( mycol >= npcol ) ) goto label_end_of_npqs;
+      
+      HPL_BE_init(&grid, HPL_TR);
 
       for( in = 0; in < ns; in++ )
       {                            /* Loop over various problem sizes */
diff --git a/testing/ptest/HPL_pdinfo.cpp b/testing/ptest/HPL_pdinfo.cpp
index 4eaab51..4938093 100644
--- a/testing/ptest/HPL_pdinfo.cpp
+++ b/testing/ptest/HPL_pdinfo.cpp
@@ -52,6 +52,8 @@
 #ifdef STDC_HEADERS
 void HPL_pdinfo
 (
+   int                              ARGC,
+   char**                           ARGV,
    HPL_T_test *                     TEST,
    int *                            NS,
    int *                            N,
@@ -61,6 +63,8 @@ void HPL_pdinfo
    int *                            NPQS,
    int *                            P,
    int *                            Q,
+   int *                            p,
+   int *                            q,
    int *                            NPFS,
    HPL_T_FACT *                     PF,
    int *                            NBMS,
@@ -274,7 +278,7 @@ void HPL_pdinfo
    int                        * iwork = NULL;
    char                       * lineptr;
    int                        error=0, fid, i, j, lwork, maxp, nprocs,
-                              rank, size;
+                              rank, size, _p = -1, _q = -1;
 /* ..
  * .. Executable Statements ..
  */
@@ -289,6 +293,20 @@ void HPL_pdinfo
  * Process 0 reads the input data, broadcasts to other processes and
  * writes needed information to TEST->outfp.
  */
+
+   for(int i = 1; i < ARGC; i++) {
+      if(strcmp(ARGV[i], "-p") == 0) {
+         _p = atoi(ARGV[i + 1]);
+         i++;
+      }
+    if(strcmp(ARGV[i], "-q") == 0) {
+      _q = atoi(ARGV[i + 1]);
+      i++;
+      }
+   }
+  *p = _p;
+  *q = _q;
+
    char* status;
    if( rank == 0 )
    {
@@ -346,6 +364,7 @@ void HPL_pdinfo
             error = 1; goto label_error;
          }
       }
+
 /*
  * Block size (>=1) (NB)
  */
diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp
index eebc99e..0b118c1 100644
--- a/testing/ptest/HPL_pdtest.cpp
+++ b/testing/ptest/HPL_pdtest.cpp
@@ -158,7 +158,7 @@ void HPL_pdtest
 /*
  * generate matrix and right-hand-side, [ A | b ] which is N by N+1.
  */
-  MPI_Type_contiguous(2*NB+4, MPI_DOUBLE, &PDFACT_ROW);
+  MPI_Type_contiguous(NB+4, MPI_DOUBLE, &PDFACT_ROW);
   MPI_Type_commit(&PDFACT_ROW);
 
   HPL_BE_dmatgen(GRID, N, N+1, NB, mat.d_A, mat.ld, HPL_ISEED, T_HIP);

From 6d0d8896a0c537d28905d2c594048ce393bdebb6 Mon Sep 17 00:00:00 2001
From: Redtorm <songxiang.zhu@foxmail.com>
Date: Mon, 12 Sep 2022 08:22:29 +0200
Subject: [PATCH 2/3] change the process mapping to column major

---
 CMakeLists.txt                     | 39 ++++++++++++++++++------------
 scripts/config/HPL_16GPU.dat       |  2 +-
 scripts/config/HPL_1GPU.dat        |  2 +-
 scripts/config/HPL_2GPU.dat        |  2 +-
 scripts/config/HPL_32GPU.dat       |  2 +-
 scripts/config/HPL_4GPU.dat        |  2 +-
 scripts/config/HPL_8GPU.dat        |  4 +--
 scripts/env.mun.sh                 |  7 ++++++
 scripts/mpirun_xhplhip.sh          |  7 ++++--
 scripts/run_hpl.slurm              | 38 +++++++++++++++++------------
 scripts/run_xhplhip.sh             |  7 +++---
 src/grid/HPL_grid_init.cpp         |  2 +-
 testing/backend/HPL_backendHIP.cpp |  8 +++---
 testing/ptest/HPL_pddriver.cpp     |  2 +-
 testing/ptest/HPL_pdtest.cpp       |  2 +-
 15 files changed, 77 insertions(+), 49 deletions(-)
 create mode 100755 scripts/env.mun.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84b31a6..f594214 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,10 +20,18 @@ EXECUTE_PROCESS(
 find_package(OpenMP)
 
 # Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm
-list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
-include(/opt/rocm/lib/cmake/hip/FindHIP.cmake)
-include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake)
-find_package( hip REQUIRED )
+if(NOT DEFINED ROCM_PATH)
+  if(DEFINED ENV{ROCM_PATH})
+    set(ROCM_PATH $ENV{ROCM_PATH})
+  else()
+    set(ROCM_PATH "/opt/rocm")
+  endif()
+endif()
+list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+set(CMAKE_MODULE_PATH "${ROCM_PATH}/hip/cmake" ${CMAKE_MODULE_PATH})
+set(CMAKE_MODULE_PATH "${ROCM_PATH}/share/rocm/cmake" ${CMAKE_MODULE_PATH})
+include(ROCMCheckTargetIds)
+find_package( HIP REQUIRED )
 find_package( rocblas REQUIRED )
 
 # switch compiler and linker on non-Windows
@@ -83,15 +91,6 @@ else ()
     endif ()
 endif ()
 
-# find_library(BLAS_LIBRARIES NAMES blis
-#              PATHS ${BLAS_DIR}
-#              NO_DEFAULT_PATH)
-# if (BLAS_LIBRARIES)
-#   message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
-# else()
-#   find_package(BLAS REQUIRED)
-# endif()
-
 if(NOT DEFINED BLAS_DIR)
   if(DEFINED ENV{BLAS_DIR})
     set(BLAS_DIR $ENV{BLAS_DIR})
@@ -100,7 +99,18 @@ if(NOT DEFINED BLAS_DIR)
 else()
   list(APPEND CMAKE_PREFIX_PATH ${BLAS_DIR})
 endif()
-find_package( BLAS REQUIRED )
+
+find_library(BLAS_LIBRARIES NAMES blis openblas
+             PATHS ${BLAS_DIR}
+             HINTS ${BLAS_DIR}/lib/zen3 ${BLAS_DIR}/lib
+             NO_DEFAULT_PATH)
+if (BLAS_LIBRARIES)
+  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+else()
+  message(STATUS "BLAS NOT Found: ${BLAS_LIBRARIES}")
+  find_package(BLAS REQUIRED)
+endif()
+# find_package( BLAS REQUIRED )
 
 # append math library, if found
 find_library(MATH_LIBRARY m)
@@ -217,7 +227,6 @@ target_include_directories( xhplhip PUBLIC hip:device
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>
 )
-# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
 target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX)
 
 configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF )
diff --git a/scripts/config/HPL_16GPU.dat b/scripts/config/HPL_16GPU.dat
index 679c241..665aa5e 100644
--- a/scripts/config/HPL_16GPU.dat
+++ b/scripts/config/HPL_16GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 364032       N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 4            Ps
 4            Qs
diff --git a/scripts/config/HPL_1GPU.dat b/scripts/config/HPL_1GPU.dat
index ee8e10a..3413c87 100644
--- a/scripts/config/HPL_1GPU.dat
+++ b/scripts/config/HPL_1GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 91008        N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 1            Ps
 1            Qs
diff --git a/scripts/config/HPL_2GPU.dat b/scripts/config/HPL_2GPU.dat
index 4897416..0362234 100644
--- a/scripts/config/HPL_2GPU.dat
+++ b/scripts/config/HPL_2GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 128256        N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 2            Ps
 1            Qs
diff --git a/scripts/config/HPL_32GPU.dat b/scripts/config/HPL_32GPU.dat
index cf14244..43fb13a 100644
--- a/scripts/config/HPL_32GPU.dat
+++ b/scripts/config/HPL_32GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 513024       N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 8            Ps
 4            Qs
diff --git a/scripts/config/HPL_4GPU.dat b/scripts/config/HPL_4GPU.dat
index 6eb090a..e336bbd 100644
--- a/scripts/config/HPL_4GPU.dat
+++ b/scripts/config/HPL_4GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 180864        N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 2            Ps
 2            Qs
diff --git a/scripts/config/HPL_8GPU.dat b/scripts/config/HPL_8GPU.dat
index 575e1a9..3d3c3c6 100644
--- a/scripts/config/HPL_8GPU.dat
+++ b/scripts/config/HPL_8GPU.dat
@@ -6,7 +6,7 @@ HPL.out      output file name (if any)
 256512       N
 1            # of NBs
 384          NBs
-0            PMAP process mapping (0=Row-,1=Column-major)
+1            PMAP process mapping (0=Row-,1=Column-major)
 1            # of process grids (P x Q)
 4            Ps
 2            Qs
@@ -19,7 +19,7 @@ HPL.out      output file name (if any)
 2            NDIVs
 1            # of recursive panel fact.
 2            RFACTs (0=left, 1=Crout, 2=Right)
-8            # of broadcast
+1            # of broadcast
 6            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast,7=BiDir)
 1            # of lookahead depth
 1            DEPTHs (>=0)
diff --git a/scripts/env.mun.sh b/scripts/env.mun.sh
new file mode 100755
index 0000000..5ef304e
--- /dev/null
+++ b/scripts/env.mun.sh
@@ -0,0 +1,7 @@
+module reset
+
+module load rocm/5.3.0-10584
+
+# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openblas-0.3.20-qbm5uv3ntjerkx4jzrprmelytviwoq2e/lib:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openmpi-4.1.4-3z7jsddbvczl4duixalzrtap3q5nuvjk/lib"
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/home/lulu/blis/lib:/global/home/lulu/ompi/lib:/global/software_internal/rocm/rocm-5.3.0-10584/lib"
+# export MPICH_GPU_SUPPORT_ENABLED=1
diff --git a/scripts/mpirun_xhplhip.sh b/scripts/mpirun_xhplhip.sh
index 617143a..8e0fc76 100644
--- a/scripts/mpirun_xhplhip.sh
+++ b/scripts/mpirun_xhplhip.sh
@@ -7,6 +7,9 @@ mpi_bin=${mpi_dir}/bin/mpiexec
 mpi_lib=${mpi_dir}/lib
 hpl_runscript=./run_xhplhip.sh
 
+if [ -z "${ROCM_PATH}" ]; then rocm_dir="/opt/rocm/lib";
+else rocm_dir="${ROCM_PATH}/lib"; fi
+
 filename=HPL.dat
 
 P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
@@ -18,9 +21,9 @@ num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
 total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))
 
 export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH="${rocm_dir}":$LD_LIBRARY_PATH
 #Default MPI options
-mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
+mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH="${rocm_dir}/lib":$LD_LIBRARY_PATH ${mpi_args}"
 
 ${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} 
 # ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} 
diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl.slurm
index 381779b..7e8f995 100755
--- a/scripts/run_hpl.slurm
+++ b/scripts/run_hpl.slurm
@@ -5,14 +5,16 @@
 #SBATCH -c 8
 #SBATCH -t 2:00:00
 ##SBATCH -A VEN114
-#SBATCH -A project_462000075
 #SBATCH -J xhplhip
 #SBATCH --gpu-bind=closest
 #SBATCH --ntasks-per-node=8
 #SBATCH --gpus-per-node=8
+#SBATCH -p MI250-x4-IB
+#SBATCH -w mun-node-4
+#SBATCH --exclusive
 
 #source ../env/env.crusher.sh
-source ../env/env.lumi.sh
+source env.mun.sh
 
 NP=$SLURM_NPROCS
 NODES=$SLURM_NNODES
@@ -21,20 +23,24 @@ LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt
 
 cp config/HPL_${NP}GPU.dat HPL.dat
 
-EXE="../build/xhplhip"
-CMD=""
-CMD+="srun "
-CMD+="-v "
-CMD+="-n $NP "
-CMD+="-N $NODES "
-CMD+="-A VEN114 "
-CMD+="--gpu-bind=closest "
-CMD+="--ntasks-per-node=8 "
-CMD+="--gpus-per-node=8 "
-CMD+="-c 8 "
-CMD+="-o $LOG -e $LOG "
-#CMD+="${HOME}/mpich_bind.sh "
-CMD+="$EXE"
+# EXE="../build/xhplhip"
+# CMD=""
+# CMD+="srun "
+# CMD+="-v "
+# CMD+="-n $NP "
+# CMD+="-N $NODES "
+# CMD+="-A VEN114 "
+# CMD+="--gpu-bind=closest "
+# CMD+="--ntasks-per-node=8 "
+# CMD+="--gpus-per-node=8 "
+# CMD+="-c 8 "
+# CMD+="-o $LOG -e $LOG "
+# #CMD+="${HOME}/mpich_bind.sh "
+# CMD+="$EXE"
+
+sh mpirun_xhplhip.sh
+
+
 
 if [ $NODES -gt 8 ]; then
     echo "export FI_MR_CACHE_MAX_COUNT=0"
diff --git a/scripts/run_xhplhip.sh b/scripts/run_xhplhip.sh
index ea4d314..c48d2f8 100755
--- a/scripts/run_xhplhip.sh
+++ b/scripts/run_xhplhip.sh
@@ -2,8 +2,9 @@
 # set -x #echo on
 
 hpl_bin=./xhplhip
-rocblas_dir=/opt/rocm-5.2.0/lib
-blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib
+if [ -z "${ROCM_PATH}" ]; then rocblas_dir="/opt/rocm/lib"
+else rocblas_dir="${ROCM_PATH}/lib"; fi
+blas_dir="${BLAS_DIR}/lib/zen3"
 
 filename=./HPL.dat
 p=-1
@@ -229,4 +230,4 @@ if [[ $globalRank -lt $size ]]; then
   echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places"
 fi
 #run
-${hpl_bin} 
+${hpl_bin} -p ${p} -q ${q}
diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp
index 2130c0a..7c20055 100644
--- a/src/grid/HPL_grid_init.cpp
+++ b/src/grid/HPL_grid_init.cpp
@@ -140,7 +140,7 @@ int HPL_grid_init
       local_mycol = local_rank % q; local_myrow = local_rank / q;
       int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q);
       myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol;
-      myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
+      // myrow = rank / NPCOL; mycol = rank - myrow * NPCOL;
    }
    else
    {
diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp
index 36b07ee..b425d75 100644
--- a/testing/backend/HPL_backendHIP.cpp
+++ b/testing/backend/HPL_backendHIP.cpp
@@ -35,6 +35,7 @@ void HIP::init(const HPL_T_grid* GRID)
     HIP_CHECK_ERROR(hipGetDeviceCount(&count));
     //TODO: set dynamic device id
     int device_id = localRank % count; 
+    // printf("host: %s, device id: %d, myrow: %d, mycol: %d\n", host_name, device_id, GRID->local_myrow, GRID->local_mycol);
     HIP_CHECK_ERROR(hipSetDevice(device_id));
 
     // Get device properties
@@ -1653,7 +1654,7 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
    */
   PANEL->ldl2 = 0;  /* local leading dim of array L2 */
   PANEL->dldl2 = 0; /* local leading dim of array L2 */
-  PANEL->dldl1 = 1.02 * A->dN; // padding
+  PANEL->dldl1 = 1.015 * A->dN; // padding to avoid stack overflow
   PANEL->len = 0;   /* length of the buffer to broadcast */
   PANEL->nu0 = 0;
   PANEL->nu1 = 0;
@@ -1666,9 +1667,10 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int
   /*Split fraction*/
   const double fraction = 0.6;
 
-  if ((double)M / A->dN > 0.97) {
+  // get the panel init time
+  if ((double)M / A->dN > 0.985) {
     HPL_ptimer_boot();
-    HPL_ptimer( 0 );
+    HPL_ptimer( HPL_rzero );
   }
   dalign = ALGO->align * sizeof(double);
   size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
diff --git a/testing/ptest/HPL_pddriver.cpp b/testing/ptest/HPL_pddriver.cpp
index f942e8e..3ddc5c0 100644
--- a/testing/ptest/HPL_pddriver.cpp
+++ b/testing/ptest/HPL_pddriver.cpp
@@ -220,7 +220,7 @@ int main( ARGC, ARGV )
 
               algo.fswap = fswap; algo.fsthr = tswap;
               algo.equil = equil; algo.align = align;
-
+              
               HPL_pdtest( &test, &grid, &algo, nval[in], nbval[inb] );
 
              }
diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp
index afea16a..e25ac5f 100644
--- a/testing/ptest/HPL_pdtest.cpp
+++ b/testing/ptest/HPL_pdtest.cpp
@@ -230,6 +230,7 @@ void HPL_pdtest
    (void) vsip_blockrelease_d( mat.block, VSIP_TRUE ); 
    vsip_blockdestroy_d( mat.block );
 #endif
+
 /*
  * Gather max of all CPU and WALL clock timings and print timing results
  */
@@ -510,7 +511,6 @@ void HPL_pdtest
       HPL_fprintf( TEST->outfp, "%s%16.8e%s%s\n",
          "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ", resid1,
          " ...... ", ( resid1 < TEST->thrsh ? "PASSED" : "FAILED" ) );
-
       if( resid1 >= TEST->thrsh ) 
       {
          HPL_fprintf( TEST->outfp, "%s%18.6f\n",

From 3bc70c6a8c6dba627ab665740b8a50cce831cee2 Mon Sep 17 00:00:00 2001
From: Redtorm <songxiang.zhu@foxmail.com>
Date: Mon, 12 Sep 2022 10:10:06 +0200
Subject: [PATCH 3/3] add slurm running script for lumi

---
 HPL.dat                                      |  2 +-
 scripts/run_hpl_lumi.slurm                   | 56 ++++++++++++++++++++
 scripts/{run_hpl.slurm => run_hpl_mun.slurm} |  5 +-
 3 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100644 scripts/run_hpl_lumi.slurm
 rename scripts/{run_hpl.slurm => run_hpl_mun.slurm} (95%)

diff --git a/HPL.dat b/HPL.dat
index 3f8575c..4033a9a 100755
--- a/HPL.dat
+++ b/HPL.dat
@@ -3,7 +3,7 @@ Innovative Computing Laboratory, University of Tennessee
 HPL.out      output file name (if any)
 0            device out (6=stdout,7=stderr,file)
 1            # of problems sizes (N)
-256000        N
+256128        N
 1            # of NBs
 384          NBs
 1            PMAP process mapping (0=Row-,1=Column-major)
diff --git a/scripts/run_hpl_lumi.slurm b/scripts/run_hpl_lumi.slurm
new file mode 100644
index 0000000..a6f2d28
--- /dev/null
+++ b/scripts/run_hpl_lumi.slurm
@@ -0,0 +1,56 @@
+#!/bin/bash
+#SBATCH -v
+#SBATCH -N 2
+#SBATCH -n 16
+#SBATCH -c 8
+#SBATCH -t 1:00:00
+#SBATCH -A VEN114
+#SBATCH -J xhplhip
+#SBATCH --gpu-bind=closest
+#SBATCH --job-name=hpl_gpu   # Job name
+#SBATCH --output=hpl.o%j # Name of stdout output file
+#SBATCH --error=hpl.e%j  # Name of stderr error file
+#SBATCH --partition=gpu    # Partition (queue) name
+#SBATCH --ntasks-per-node=8
+#SBATCH --gpus-per-node=8
+#SBATCH --time=0-01:00:00      # Run time (d-hh:mm:ss)
+#SBATCH --account=project_462000075  # Project for billing
+#SBATCH --exclusive
+
+source ../env/env.lumi.sh
+
+export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+export MPICH_GPU_SUPPORT_ENABLED=1
+
+NP=$SLURM_NPROCS
+NODES=$SLURM_NNODES
+DATE=$(date +%y%m%d-%H%M%S)
+LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt
+
+cp config/HPL_${NP}GPU.dat HPL.dat
+
+EXE="./xhplhip -p 4 -q 2"
+CMD=""
+CMD+="srun "
+CMD+="-v "
+CMD+="-n $NP "
+CMD+="-N $NODES "
+# CMD+="-A VEN114 "
+CMD+="--gpu-bind=closest "
+CMD+="--ntasks-per-node=8 "
+CMD+="--gpus-per-node=8 "
+CMD+="--exclusive "
+CMD+="-c 8 "
+CMD+="-o $LOG -e $LOG "
+#CMD+="${HOME}/mpich_bind.sh "
+CMD+="$EXE"
+
+#export MPICH_SMP_SINGLE_COPY_MODE=NONE # does not work
+export FI_MR_CACHE_MAX_COUNT=0
+export MPICH_RANK_REORDER_DISPLAY=1
+
+echo $CMD >> $LOG
+echo $CMD 2>&1 | tee -a $LOG
+     $CMD 2>&1 | tee -a $LOG
+cat HPL.dat 2>&1 | tee -a $LOG
+cat HPL.out 2>&1 | tee -a $LOG
\ No newline at end of file
diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl_mun.slurm
similarity index 95%
rename from scripts/run_hpl.slurm
rename to scripts/run_hpl_mun.slurm
index 7e8f995..2b59355 100755
--- a/scripts/run_hpl.slurm
+++ b/scripts/run_hpl_mun.slurm
@@ -13,7 +13,6 @@
 #SBATCH -w mun-node-4
 #SBATCH --exclusive
 
-#source ../env/env.crusher.sh
 source env.mun.sh
 
 NP=$SLURM_NPROCS
@@ -38,9 +37,7 @@ cp config/HPL_${NP}GPU.dat HPL.dat
 # #CMD+="${HOME}/mpich_bind.sh "
 # CMD+="$EXE"
 
-sh mpirun_xhplhip.sh
-
-
+bash mpirun_xhplhip.sh
 
 if [ $NODES -gt 8 ]; then
     echo "export FI_MR_CACHE_MAX_COUNT=0"