From b9115429a6ddac21acd4e5c01f642d922def7dd4 Mon Sep 17 00:00:00 2001 From: Redtorm Date: Fri, 9 Sep 2022 10:48:36 +0200 Subject: [PATCH 1/3] optimize the one-node performance --- CMakeLists.txt | 6 +- HPL.dat | 6 +- include/backend/hpl_backendHIP.h | 6 +- include/backend/hpl_backendWrapper.h | 2 +- include/hpl_grid.h | 6 + include/hpl_panel.h | 1 + include/hpl_pgesv.h | 1 + include/hpl_ptest.h | 4 + scripts/mpirun_xhplhip.sh | 2 +- scripts/run_xhplhip.sh | 206 ++- src/grid/HPL_grid_init.cpp | 20 +- src/pgesv/HPL_pdgesvK2_HIP.cpp | 2 +- testing/backend/HPL_backendHIP.cpp | 1731 ++++++++++++------------ testing/backend/HPL_backendWrapper.cpp | 4 +- testing/ptest/HPL_pddriver.cpp | 19 +- testing/ptest/HPL_pdinfo.cpp | 21 +- testing/ptest/HPL_pdtest.cpp | 2 +- 17 files changed, 1112 insertions(+), 927 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 930e8f0..46097de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ find_package(OpenMP) # Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm list(APPEND CMAKE_PREFIX_PATH "/opt/rocm") -include(/opt/rocm/hip/cmake/FindHIP.cmake) +include(/opt/rocm/lib/cmake/hip/FindHIP.cmake) include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake) find_package( hip REQUIRED ) find_package( rocblas REQUIRED ) @@ -212,12 +212,12 @@ hip_add_executable( xhplhip ${hpl_device_source} ${hpl_host_source}) target_compile_options(xhplhip PRIVATE ${CMAKE_HOST_FLAGS}) target_include_directories( xhplhip PUBLIC hip:device - ${HIP_ROOT_DIR}/include + # ${HIP_ROOT_DIR}/include ${HPLHIP_DEVRAND_INCLUDE_DIRS} $ $ ) -# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so) +# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so) target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX) configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF ) diff --git a/HPL.dat b/HPL.dat index 71bd3c2..3f8575c 100755 --- a/HPL.dat +++ b/HPL.dat @@ -6,10 +6,10 @@ HPL.out output file name (if any) 256000 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) -2 Ps -4 Qs +4 Ps +2 Qs 16.0 threshold 1 # of panel fact 2 PFACTs (0=left, 1=Crout, 2=Right) diff --git a/include/backend/hpl_backendHIP.h b/include/backend/hpl_backendHIP.h index 6c75ada..1df8ed2 100644 --- a/include/backend/hpl_backendHIP.h +++ b/include/backend/hpl_backendHIP.h @@ -2,9 +2,9 @@ #include #if defined(HPLHIP_USE_ROCRAND) -#include +#include #endif -#include +#include #include #include @@ -81,7 +81,7 @@ enum SWP_PHASE { }; namespace HIP { - void init(size_t); + void init(const HPL_T_grid*); void release(); void malloc(void**, size_t); diff --git a/include/backend/hpl_backendWrapper.h b/include/backend/hpl_backendWrapper.h index 785a793..206aeac 100644 --- a/include/backend/hpl_backendWrapper.h +++ b/include/backend/hpl_backendWrapper.h @@ -26,7 +26,7 @@ UPDATE_LOOK_AHEAD, L1TRANSFER, L2TRANSFER, DGEMMSTART, DGEMMSTOP, UPDATE, SWAPST enum HPL_STREAM {HPL_COMPUTESTREAM, HPL_DATASTREAM, HPL_PDLASWPSTREAM}; -void HPL_BE_init(size_t, enum HPL_TARGET); +void HPL_BE_init(const HPL_T_grid*, enum HPL_TARGET); void HPL_BE_malloc(void**, size_t, enum HPL_TARGET); diff --git a/include/hpl_grid.h b/include/hpl_grid.h index 1895a5e..3c1397f 100644 --- a/include/hpl_grid.h +++ b/include/hpl_grid.h @@ -84,6 +84,10 @@ typedef struct HPL_S_grid int col_hdim; /* col_ip2 procs hypercube dimension */ int col_ip2m1; /* largest power of two <= npcol-1 */ int col_mask; /* col_ip2m1 procs hypercube mask */ + int local_myrow; + int local_mycol; + int local_nprow; + int local_npcol; } HPL_T_grid; /* @@ -132,6 +136,8 @@ STDC_ARGS( ( const HPL_T_ORDER, const int, const int, + const int, + const int, HPL_T_grid * ) ); int HPL_grid_exit diff --git a/include/hpl_panel.h b/include/hpl_panel.h index fffb496..b3c12e5 100644 --- a/include/hpl_panel.h +++ b/include/hpl_panel.h @@ -125,6 +125,7 @@ typedef struct HPL_S_panel int dlda; double* U2; /* ptr to U2 */ double* dU2; + int dldl1; int dldl2; int ldu1; int ldu2; diff --git a/include/hpl_pgesv.h b/include/hpl_pgesv.h index 2acf3cc..6d251eb 100644 --- a/include/hpl_pgesv.h +++ b/include/hpl_pgesv.h @@ -121,6 +121,7 @@ typedef struct HPL_S_pmat double * d_X; /* device pointer to solution vector */ double * W; double * dW; + int dN; } HPL_T_pmat; /* * --------------------------------------------------------------------- diff --git a/include/hpl_ptest.h b/include/hpl_ptest.h index 5777bd5..1f13e5d 100644 --- a/include/hpl_ptest.h +++ b/include/hpl_ptest.h @@ -108,6 +108,8 @@ typedef struct HPL_S_test */ void HPL_pdinfo STDC_ARGS( ( + int, + char**, HPL_T_test *, int *, int *, @@ -118,6 +120,8 @@ STDC_ARGS( ( int *, int *, int *, + int *, + int *, HPL_T_FACT *, int *, int *, diff --git a/scripts/mpirun_xhplhip.sh b/scripts/mpirun_xhplhip.sh index e096823..617143a 100644 --- a/scripts/mpirun_xhplhip.sh +++ b/scripts/mpirun_xhplhip.sh @@ -20,7 +20,7 @@ total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH #Default MPI options -mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx --report-bindings -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}" +mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}" ${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} # ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} diff --git a/scripts/run_xhplhip.sh b/scripts/run_xhplhip.sh index 777801f..ea4d314 100755 --- a/scripts/run_xhplhip.sh +++ b/scripts/run_xhplhip.sh @@ -2,11 +2,12 @@ # set -x #echo on hpl_bin=./xhplhip -rocblas_dir=/opt/rocm/rocblas/lib -blas_dir=$HOME/OpenBLAS +rocblas_dir=/opt/rocm-5.2.0/lib +blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib filename=./HPL.dat - +p=-1 +q=-1 export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:$LD_LIBRARY_PATH oversubscribe=true @@ -15,12 +16,15 @@ P=$(sed -n "11, 1p" ${filename} | awk '{print $1}') Q=$(sed -n "12, 1p" ${filename} | awk '{print $1}') np=$(($P*$Q)) -# Get local process numbering set +u if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK+x} ]]; then + globalRank=$OMPI_COMM_WORLD_RANK + globalSize=$OMPI_COMM_WORLD_SIZE rank=$OMPI_COMM_WORLD_LOCAL_RANK size=$OMPI_COMM_WORLD_LOCAL_SIZE elif [[ -n ${SLURM_LOCALID+x} ]]; then + globalRank=$SLURM_PROCID + globalSize=$SLURM_NTASKS rank=$SLURM_LOCALID size=$SLURM_TASKS_PER_NODE #Slurm can return a string like "2(x2),1". Get the first number @@ -28,69 +32,201 @@ elif [[ -n ${SLURM_LOCALID+x} ]]; then fi set -u -# count the number of physical cores on node -num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') -num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') -total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) +#Determing node-local grid size +if [[ "$p" -lt 1 && "$q" -lt 1 ]]; then + # no node-local grid was specified, pick defaults + q=$(( (Q<=size) ? Q : size)) + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$p" -lt 1 ]]; then + #q was specified + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$q" -lt 1 ]]; then + #p was specified -# We assume a row-major process mapping to nodes -columns_per_node=$(( Q < size ? Q : size )) + if [[ $((size % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + q=$(( size/p )) + +else + #Both p and q were specified + if [[ $size -ne $((p*q)) ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi +fi # Check that the columns are evenly divided among nodes -if [[ $((Q % columns_per_node)) -gt 0 ]]; then - echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting"; +if [[ $((P % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting"; exit 1 fi # Check that the rows are evenly divided among nodes -if [[ $((size % columns_per_node)) -gt 0 ]]; then - echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting"; +if [[ $((Q % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting"; exit 1 fi -rows_per_node=$(( size/columns_per_node )) +# count the number of physical cores on node +num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') +num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') +total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) # Ranks in different processes rows will take distinct chunks of cores -row_stride=$((total_cpu_cores/rows_per_node)) -col_stride=$((row_stride/columns_per_node)) +row_stride=$((total_cpu_cores/p)) +col_stride=$((row_stride/q)) -myP=$((rank/columns_per_node)) -myQ=$((rank%columns_per_node)) +myp=$((rank%p)) +myq=$((rank/p)) -# Default core mapping +#Although ranks are column-major order, we select GPUs in row-major order on node +mygpu=$((myq+myp*q)) -root_core=$((myP*row_stride + myQ*col_stride)) +# Try to detect special Bard-peak core mapping +if [[ -n ${HPL_PLATFORM+x} ]]; then + platform=$HPL_PLATFORM +else + platform=$(cat /sys/class/dmi/id/product_name) +fi -omp_num_threads=${col_stride} -# First omp place is the root core -omp_places="{$root_core}" +if [[ "$platform" == "BardPeak" || "$platform" == "HPE_CRAY_EX235A" ]]; then + # Special core mapping for BardPeak + + # Debug + # if [[ $globalRank == 0 ]]; then + # echo "BardPeak platform detected" + # fi + + # Sanity check + if [[ $size -gt 8 ]]; then + echo "Unsupported number of ranks on BardPeak platform; aborting"; + exit 1 + fi + + # GCD0 cores="48-55" + # GCD1 cores="56-63" + # GCD2 cores="16-23" + # GCD3 cores="24-31" + # GCD4 cores="0-7" + # GCD5 cores="8-15" + # GCD6 cores="32-39" + # GCD7 cores="40-47" + + root_cores=(48 56 16 24 0 8 32 40) + root_core=${root_cores[mygpu]} + + # First omp place is the root core + omp_places="{$root_core}" + + # First assign the CCD + for i in $(seq $((root_core+1)) $((root_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=8 + + places="{$root_core-$((root_core+7))}" + + # Loop through unassigned CCDs + for c in $(seq $((mygpu+size)) $size 7) + do + iroot_core=${root_cores[c]} + for i in $(seq $((iroot_core)) $((iroot_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8)) + places+=",{$iroot_core-$((iroot_core+7))}" + done -# Make contiuguous chunk of cores (to maximize L1/L2 locality) -for i in $(seq $((root_core+1)) $((root_core+col_stride-1))) -do - omp_places+=",{$i}" -done + if [[ "${oversubscribe}" == true ]]; then + # Add cores from different columns, without their root cores + for j in $(seq 0 $((q-1))) + do + if [[ "$j" == "$myq" ]]; then + continue + fi + for jj in $(seq 0 $size 7) + do + q_gpu=$((jj+j+myp*q)) + q_core=$((root_cores[q_gpu])) + offset=$(( (q_gpu>=size) ? 0 : 1)) + for i in $(seq $((q_core+offset)) $((q_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8-offset)) + places+=",{$((q_core+offset))-$((q_core+7))}" + done + done + fi -if [[ "${oversubscribe}" == true ]]; then +else + # Default core mapping + root_core=$((myp*row_stride + myq*col_stride)) + + omp_num_threads=${col_stride} + # First omp place is the root core + omp_places="{$root_core}" + + # Make contiuguous chunk of cores (to maximize L1/L2 locality) + for i in $(seq $((root_core+1)) $((root_core+col_stride-1))) + do + omp_places+=",{$i}" + done + + if [[ $col_stride -gt 1 ]]; then + places="{$root_core-$((root_core+col_stride-1))}" + else + places="{$root_core}" + fi + + if [[ "${oversubscribe}" == true ]]; then # Add cores from different columns, without their root cores - for q in $(seq 0 $((columns_per_node-1))) + for j in $(seq 0 $((q-1))) do - if [[ "$q" == "$myQ" ]]; then + if [[ "$j" == "$myq" ]]; then continue fi - q_core=$((myP*row_stride + q*col_stride)) + q_core=$((myp*row_stride + j*col_stride)) for i in $(seq $((q_core+1)) $((q_core+col_stride-1))) do omp_places+=",{$i}" done omp_num_threads=$((omp_num_threads+col_stride-1)) + + if [[ $col_stride -gt 2 ]]; then + places+=",{$((q_core+1))-$((q_core+col_stride-1))}" + elif [[ $col_stride -gt 1 ]]; then + places+=",{$((q_core+1))}" + fi + done + fi fi - # Export OpenMP config export OMP_NUM_THREADS=${omp_num_threads} export OMP_PLACES=${omp_places} export OMP_PROC_BIND=true - +if [[ $globalRank -lt $size ]]; then + echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places" +fi #run ${hpl_bin} diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp index 52111ac..2130c0a 100644 --- a/src/grid/HPL_grid_init.cpp +++ b/src/grid/HPL_grid_init.cpp @@ -56,6 +56,8 @@ int HPL_grid_init const HPL_T_ORDER ORDER, const int NPROW, const int NPCOL, + const int p, + const int q, HPL_T_grid * GRID ) #else @@ -65,6 +67,8 @@ int HPL_grid_init const HPL_T_ORDER ORDER; const int NPROW; const int NPCOL; + const int p, + const int q, HPL_T_grid * GRID; #endif { @@ -111,7 +115,8 @@ int HPL_grid_init * .. Local Variables .. */ int hdim, hplerr=MPI_SUCCESS, ierr, ip2, k, - mask, mycol, myrow, nprocs, rank, size; + mask, mycol, myrow, nprocs, rank, size, + local_myrow, local_mycol; /* .. * .. Executable Statements .. */ @@ -125,18 +130,29 @@ int HPL_grid_init /* * Row- or column-major ordering of the processes */ + int local_size = p * q; + int local_rank = rank % local_size; + int node = rank / local_size; // node number + if( ORDER == HPL_ROW_MAJOR ) { GRID->order = HPL_ROW_MAJOR; + local_mycol = local_rank % q; local_myrow = local_rank / q; + int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q); + myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol; myrow = rank / NPCOL; mycol = rank - myrow * NPCOL; } else { GRID->order = HPL_COLUMN_MAJOR; - mycol = rank / NPROW; myrow = rank - mycol * NPROW; + local_mycol = local_rank / p; local_myrow = local_rank % p; + int noderow = node % (NPROW / p); int nodecol = node / (NPROW / p); + myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol; } GRID->iam = rank; GRID->myrow = myrow; GRID->mycol = mycol; GRID->nprow = NPROW; GRID->npcol = NPCOL; GRID->nprocs = nprocs; + GRID->local_myrow = local_myrow; GRID->local_mycol = local_mycol; + GRID->local_nprow = p; GRID->local_npcol = q; /* * row_ip2 : largest power of two <= nprow; * row_hdim : row_ip2 procs hypercube dim; diff --git a/src/pgesv/HPL_pdgesvK2_HIP.cpp b/src/pgesv/HPL_pdgesvK2_HIP.cpp index 3632fc1..ef74cdd 100644 --- a/src/pgesv/HPL_pdgesvK2_HIP.cpp +++ b/src/pgesv/HPL_pdgesvK2_HIP.cpp @@ -286,7 +286,6 @@ void HPL_pdgesvK2_HIP /* if this is process 0,0 and not the first panel */ if(GRID->myrow == 0 && mycol == 0 && j > 0) { time = HPL_ptimer_walltime() - start_time; - gflops = 2.0 * (N * (double)N * N - n * (double)n * n) / 3.0 / (time > 0.0 ? time : 1.e-6) / 1.e9; printf("Column=%09d (%4.1f%%) ", j, j * 100.0 / N); printf("Step Time(s)=%9.7f ", stepEnd-stepStart); @@ -305,6 +304,7 @@ void HPL_pdgesvK2_HIP if (panel[0]->nu1) printf("DGEMM2 Gflops=%9.3e ", largeDgemm1Gflops); else printf("DGEMM2 Gflops=--------- "); + gflops = 2.0 * (A->n * (double)A->n * A->n - (A->n - j) * (double)(A->n - j) * (A->n - j)) / 3.0 / (time > 0.0 ? time : 1.e-6) / 1.e9; printf("Overall Gflops=%9.3e\n", gflops); } #endif diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp index e51a57f..36b07ee 100644 --- a/testing/backend/HPL_backendHIP.cpp +++ b/testing/backend/HPL_backendHIP.cpp @@ -3,13 +3,16 @@ -void HIP::init(size_t num_gpus) +void HIP::init(const HPL_T_grid* GRID) { int rank, size, count, namelen; size_t bytes; char (*host_names)[MPI_MAX_PROCESSOR_NAME]; + int nprow, npcol, myrow, mycol; + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size ); MPI_Get_processor_name(host_name,&namelen); @@ -23,14 +26,9 @@ void HIP::init(size_t num_gpus) for (int n=0; n < size; n++){ MPI_Bcast(&(host_names[n]),MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD); } - int localRank = 0; - for (int n = 0; n < rank; n++){ - if (!strcmp(host_name, host_names[n])) localRank++; - } - int localSize = 0; - for (int n = 0; n < size; n++){ - if (!strcmp(host_name, host_names[n])) localSize++; - } + + int localRank = GRID->local_mycol + GRID->local_myrow * GRID->local_npcol; + int localSize = GRID->local_npcol * GRID->local_nprow; hipDeviceProp_t hipDeviceProp; @@ -169,508 +167,122 @@ void HIP::panel_new(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N } -void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N, const int JB, - HPL_T_pmat *A, const int IA, const int JA, const int TAG, HPL_T_panel *PANEL) + + +void HIP::panel_send_to_host(HPL_T_panel *PANEL) { - size_t dalign; - int icurcol, icurrow, ii, itmp1, jj, lwork, - ml2, mp, mycol, myrow, nb, npcol, nprow, - nq, nu, ldu; - /* .. - * .. Executable Statements .. - */ - PANEL->grid = GRID; /* ptr to the process grid */ - PANEL->algo = ALGO; /* ptr to the algo parameters */ - PANEL->pmat = A; /* ptr to the local array info */ + int jb = PANEL->jb; - myrow = GRID->myrow; - mycol = GRID->mycol; - nprow = GRID->nprow; - npcol = GRID->npcol; - nb = A->nb; + if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return; + if(PANEL->mp > 0) + HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->A, PANEL->lda*sizeof(double), + PANEL->dA, PANEL->dlda*sizeof(double), + PANEL->mp*sizeof(double), jb, + hipMemcpyDeviceToHost, dataStream)); + HIP_CHECK_ERROR(hipEventRecord(panelCopy, dataStream)); +} - HPL_infog2l(IA, JA, nb, nb, nb, nb, 0, 0, myrow, mycol, - nprow, npcol, &ii, &jj, &icurrow, &icurcol); - mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow); - nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol); +// Only for P=1 +void HPL_unroll_ipiv(const int mp, const int jb, int* ipiv, int* ipiv_ex, int* upiv) +{ + for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids for the swapping + for(int i = 0; i < jb; i++) { // swap ids + int id = upiv[i]; + upiv[i] = upiv[ipiv[i]]; + upiv[ipiv[i]] = id; + } - const int inxtcol = MModAdd1(icurcol, npcol); - const int inxtrow = MModAdd1(icurrow, nprow); + for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; } - /* ptr to trailing part of A */ - PANEL->A = A->A; - PANEL->dA = Mptr((double *)(A->d_A), ii, jj, A->ld); + int cnt = 0; + for(int i = jb; i < mp; i++) { // find swapped ids outside of panel + if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; } + } +} - /* - * Workspace pointers are initialized to NULL. - */ - PANEL->L2 = nullptr; - PANEL->dL2 = nullptr; - PANEL->L1 = nullptr; - PANEL->dL1 = nullptr; - PANEL->DINFO = nullptr; - PANEL->U = nullptr; - PANEL->dU = nullptr; - PANEL->W = nullptr; - PANEL->dW = nullptr; - PANEL->U1 = nullptr; - PANEL->dU1 = nullptr; - PANEL->W1 = nullptr; - PANEL->dW1 = nullptr; - PANEL->U2 = nullptr; - PANEL->dU2 = nullptr; - PANEL->W2 = nullptr; - PANEL->dW2 = nullptr; - /* - * Local lengths, indexes process coordinates - */ - PANEL->nb = nb; /* distribution blocking factor */ - PANEL->jb = JB; /* panel width */ - PANEL->m = M; /* global # of rows of trailing part of A */ - PANEL->n = N; /* global # of cols of trailing part of A */ - PANEL->ia = IA; /* global row index of trailing part of A */ - PANEL->ja = JA; /* global col index of trailing part of A */ - PANEL->mp = mp; /* local # of rows of trailing part of A */ - PANEL->nq = nq; /* local # of cols of trailing part of A */ - PANEL->ii = ii; /* local row index of trailing part of A */ - PANEL->jj = jj; /* local col index of trailing part of A */ - PANEL->lda = Mmax(1, mp); /* local leading dim of array A */ - PANEL->dlda = A->ld; /* local leading dim of array A */ - PANEL->prow = icurrow; /* proc row owning 1st row of trailing A */ - PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ - PANEL->msgid = TAG; /* message id to be used for panel bcast */ - /* - * Initialize ldl2 and len to temporary dummy values and Update tag for - * next panel - */ - PANEL->ldl2 = 0; /* local leading dim of array L2 */ - PANEL->dldl2 = 0; /* local leading dim of array L2 */ - PANEL->len = 0; /* length of the buffer to broadcast */ - PANEL->nu0 = 0; - PANEL->nu1 = 0; - PANEL->nu2 = 0; - PANEL->ldu0 = 0; - PANEL->ldu1 = 0; - PANEL->ldu2 = 0; +void HIP::panel_send_to_device(HPL_T_panel *PANEL) +{ + double *A, *dA; + int jb, i, ml2; - /*Split fraction*/ - const double fraction = 0.7; + jb = PANEL->jb; - dalign = ALGO->align * sizeof(double); - size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double)); + if (jb <= 0) + return; - if (npcol == 1) /* P x 1 process grid */ - { /* space for L1, DPIV, DINFO */ - lwork = ALGO->align + (PANEL->len = JB * JB + lpiv) + 1; - nu = Mmax(0, nq - JB); - ldu = nu + 256; /*extra space for padding*/ - lwork += JB * ldu; + // only the root column copies to device + if (PANEL->grid->mycol == PANEL->pcol) { - if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double)) - { - if (PANEL->WORK) - { - HIP_CHECK_ERROR(hipFree(PANEL->dWORK)); - HIP_CHECK_ERROR(hipHostFree(PANEL->WORK)); - } - size_t numbytes = (size_t)(lwork) * sizeof(double); + if(PANEL->grid->nprow == 1) { - if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS || - hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS) - { - HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed"); - } - PANEL->max_work_size = (size_t)(lwork) * sizeof(double); + // unroll pivoting and send to device now + int* ipiv = PANEL->ipiv; + int* ipiv_ex = PANEL->ipiv + jb; + int* upiv = PANEL->IWORK + jb; // scratch space -#ifdef HPL_VERBOSE_PRINT - if ((myrow == 0) && (mycol == 0)) - { - printf("Allocating %g GBs of storage on CPU...", - ((double)numbytes) / (1024 * 1024 * 1024)); - fflush(stdout); + for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift + HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv); - printf("done.\n"); - printf("Allocating %g GBs of storage on GPU...", - ((double)numbytes) / (1024 * 1024 * 1024)); - fflush(stdout); - printf("done.\n"); - } -#endif - } - /* - * Initialize the pointers of the panel structure - Always re-use A in - * the only process column - */ - PANEL->ldl2 = Mmax(1, mp); - PANEL->dldl2 = A->ld; - PANEL->dL2 = PANEL->dA + (myrow == icurrow ? JB : 0); - PANEL->L2 = PANEL->A + (myrow == icurrow ? JB : 0); - PANEL->U = (double *)PANEL->WORK; - PANEL->dU = (double *)PANEL->dWORK; - PANEL->L1 = (double *)PANEL->WORK + (JB * Mmax(0, ldu)); - PANEL->dL1 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu)); - PANEL->W = A->W; - PANEL->dW = A->dW; + int* dipiv = PANEL->dipiv; + int* dipiv_ex = PANEL->dipiv + jb; - if (nprow == 1) - { - PANEL->nu0 = Mmin(JB, nu); - PANEL->ldu0 = PANEL->nu0; + HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), + upiv, jb * sizeof(int), + jb * sizeof(int), 1, + hipMemcpyHostToDevice, dataStream)); + HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv_ex, jb * sizeof(int), + ipiv_ex, jb * sizeof(int), + jb * sizeof(int), 1, + hipMemcpyHostToDevice, dataStream)); + } + else { + int k; + int *iflag, *ipl, *ipID, *ipA, *iplen, *ipmap, *ipmapm1, *upiv, *iwork, + *lindxU, *lindxA, *lindxAU, *permU, *permU_ex, *ipiv, + *dlindxU, *dlindxA, *dlindxAU, *dpermU, *dpermU_ex, *dipiv; - PANEL->nu1 = 0; - PANEL->ldu1 = 0; + k = (int)((unsigned int)(jb) << 1); + iflag = PANEL->IWORK; ipl = iflag + 1; ipID = ipl + 1; ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; ipmap = iplen + PANEL->grid->nprow + 1; ipmapm1 = ipmap + PANEL->grid->nprow; + upiv = ipmapm1 + PANEL->grid->nprow; iwork = upiv + PANEL->mp; - PANEL->nu2 = nu - PANEL->nu0; - PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + lindxU = PANEL->lindxU; lindxA = PANEL->lindxA; lindxAU = PANEL->lindxAU; + permU = PANEL->permU; permU_ex = permU + jb; ipiv = PANEL->ipiv; + dlindxU = PANEL->dlindxU; dlindxA = PANEL->dlindxA; dlindxAU = PANEL->dlindxAU; + dpermU = PANEL->dpermU; dpermU_ex = dpermU + jb; dipiv = PANEL->dipiv; - PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; - PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; - PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; - PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; + if(*iflag == -1) { /* no index arrays have been computed so far */ + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx(PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork); + *iflag = 1; + } - PANEL->permU = (int *)(PANEL->L1 + JB * JB); - PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB); - PANEL->ipiv = PANEL->permU + JB; - PANEL->dipiv = PANEL->dpermU + JB; + int N = Mmax(*ipA, jb); + if(N > 0) { + HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxA, k * sizeof(int), lindxA, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); + HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxAU, k * sizeof(int), lindxAU, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); + } - PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB); - PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB); + HIP_CHECK_ERROR(hipMemcpyAsync(dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream)); + + HIP_CHECK_ERROR(hipMemcpy2DAsync(dpermU, jb * sizeof(int), permU, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); + HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), ipiv, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); } - else - { - const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); - PANEL->nu0 = Mmin(JB, nu); - PANEL->ldu0 = PANEL->nu0; + } - PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); - PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + //record when the swap data will arrive + HIP_CHECK_ERROR(hipEventRecord(swapDataTransfer, dataStream)); - PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; - PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ + // copy A and/or L2 + if(PANEL->grid->mycol == PANEL->pcol) { + // copy L1 + HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->dL1, jb * sizeof(double), + PANEL->L1, jb * sizeof(double), + jb * sizeof(double), jb, + hipMemcpyHostToDevice, dataStream)); - PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; - PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; - PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; - PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - - PANEL->W1 = PANEL->W + PANEL->ldu0 * JB; - PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB; - PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB; - PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB; - - PANEL->lindxA = (int *)(PANEL->L1 + JB * JB); - PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB); - PANEL->lindxAU = PANEL->lindxA + JB; - PANEL->dlindxAU = PANEL->dlindxA + JB; - PANEL->lindxU = PANEL->lindxAU + JB; - PANEL->dlindxU = PANEL->dlindxAU + JB; - PANEL->permU = PANEL->lindxU + JB; - PANEL->dpermU = PANEL->dlindxU + JB; - - // Put ipiv array at the end - PANEL->dipiv = PANEL->dpermU + JB; - PANEL->ipiv = PANEL->permU + JB; - - PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv; - PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv; - } - - *(PANEL->DINFO) = 0.0; - } - else // for ncol != 1 - { /* space for L2, L1, DPIV */ - ml2 = (myrow == icurrow ? mp - JB : mp); - ml2 = Mmax(0, ml2); - ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/ - itmp1 = JB * JB + lpiv; // L1, integer arrays - PANEL->len = ml2 * JB + itmp1; - - lwork = ALGO->align + PANEL->len + 1; - - nu = Mmax(0, (mycol == icurcol ? nq - JB : nq)); - ldu = nu + 256; /*extra space for potential padding*/ - - // if( nprow > 1 ) /* space for U */ - { - lwork += JB * ldu; - } - if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double)) - { - if (PANEL->WORK) - { - HIP_CHECK_ERROR(hipFree(PANEL->dWORK)); - HIP_CHECK_ERROR(hipHostFree(PANEL->WORK)); - } - size_t numbytes = (size_t)(lwork) * sizeof(double); - - if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS || - hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS) - { - HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed"); - } - PANEL->max_work_size = (size_t)(lwork) * sizeof(double); -#ifdef HPL_VERBOSE_PRINT - if ((myrow == 0) && (mycol == 0)) - { - printf("Allocating %g GBs of storage on CPU...", - ((double)numbytes) / (1024 * 1024 * 1024)); - fflush(stdout); - printf("done.\n"); - printf("Allocating %g GBs of storage on GPU...", - ((double)numbytes) / (1024 * 1024 * 1024)); - fflush(stdout); - printf("done.\n"); - } -#endif - } - /* - * Initialize the pointers of the panel structure - Re-use A in the cur- - * rent process column when HPL_COPY_L is not defined. - */ - PANEL->U = (double *)PANEL->WORK; - PANEL->dU = (double *)PANEL->dWORK; - - PANEL->W = A->W; - PANEL->dW = A->dW; - - PANEL->L2 = (double *)PANEL->WORK + (JB * Mmax(0, ldu)); - PANEL->dL2 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu)); - PANEL->L1 = PANEL->L2 + ml2 * JB; - PANEL->dL1 = PANEL->dL2 + ml2 * JB; - PANEL->ldl2 = Mmax(1, ml2); - PANEL->dldl2 = Mmax(1, ml2); - - if (nprow == 1) - { - PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; - PANEL->ldu0 = PANEL->nu0; - - PANEL->nu1 = 0; - PANEL->ldu1 = 0; - - PANEL->nu2 = nu - PANEL->nu0; - PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ - - PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; - PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; - PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; - PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - - PANEL->permU = (int *)(PANEL->L1 + JB * JB); - PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB); - PANEL->ipiv = PANEL->permU + JB; - PANEL->dipiv = PANEL->dpermU + JB; - - PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB); - PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB); - } - else - { - const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); - PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; - PANEL->ldu0 = PANEL->nu0; - - PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); - PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ - - PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; - PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ - - PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; - PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; - PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; - PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - - PANEL->W1 = PANEL->W + PANEL->ldu0 * JB; - PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB; - PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB; - PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB; - - PANEL->lindxA = (int *)(PANEL->L1 + JB * JB); - PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB); - PANEL->lindxAU = PANEL->lindxA + JB; - PANEL->dlindxAU = PANEL->dlindxA + JB; - PANEL->lindxU = PANEL->lindxAU + JB; - PANEL->dlindxU = PANEL->dlindxAU + JB; - PANEL->permU = PANEL->lindxU + JB; - PANEL->dpermU = PANEL->dlindxU + JB; - - // Put ipiv array at the end - PANEL->ipiv = PANEL->permU + JB; - PANEL->dipiv = PANEL->dpermU + JB; - - PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv; - PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv; - } - - *(PANEL->DINFO) = 0.0; - } - - if (nprow == 1) - { - lwork = mp + JB; - } - else - { - itmp1 = (JB << 1); - lwork = nprow + 1; - itmp1 = Mmax(itmp1, lwork); - lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1; - } - - if (PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int)) - { - if (PANEL->IWORK) - { - std::free(PANEL->IWORK); - } - size_t numbytes = (size_t)(lwork) * sizeof(int); - PANEL->IWORK = (int *)std::malloc(numbytes); - if (PANEL->IWORK == NULL) - { - HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host Integer Memory allocation failed"); - } - PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int); - } - if (lwork) - *(PANEL->IWORK) = -1; - - /* ensure the temp buffer in HPL_pdfact is allocated once*/ - lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1)); - if (PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double)) - { - if (PANEL->fWORK) - { - HIP_CHECK_ERROR(hipHostFree(PANEL->fWORK)); - } - size_t numbytes = (size_t)(lwork) * sizeof(double); - - HIP_CHECK_ERROR(hipHostMalloc((void **)&PANEL->fWORK, numbytes)); - if (PANEL->fWORK == NULL) - { - HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host pdfact Scratch Memory allocation failed"); - } - PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double); - } - /* - * End of HPL_pdpanel_init - */ -} - -void HIP::panel_send_to_host(HPL_T_panel *PANEL) -{ - int jb = PANEL->jb; - - if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return; - if(PANEL->mp > 0) - HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->A, PANEL->lda*sizeof(double), - PANEL->dA, PANEL->dlda*sizeof(double), - PANEL->mp*sizeof(double), jb, - hipMemcpyDeviceToHost, dataStream)); - HIP_CHECK_ERROR(hipEventRecord(panelCopy, dataStream)); -} - -// Only for P=1 -void HPL_unroll_ipiv(const int mp, const int jb, int* ipiv, int* ipiv_ex, int* upiv) -{ - for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids for the swapping - for(int i = 0; i < jb; i++) { // swap ids - int id = upiv[i]; - upiv[i] = upiv[ipiv[i]]; - upiv[ipiv[i]] = id; - } - - for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; } - - int cnt = 0; - for(int i = jb; i < mp; i++) { // find swapped ids outside of panel - if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; } - } -} - -void HIP::panel_send_to_device(HPL_T_panel *PANEL) -{ - double *A, *dA; - int jb, i, ml2; - - jb = PANEL->jb; - - if (jb <= 0) - return; - - // only the root column copies to device - if (PANEL->grid->mycol == PANEL->pcol) { - - if(PANEL->grid->nprow == 1) { - - // unroll pivoting and send to device now - int* ipiv = PANEL->ipiv; - int* ipiv_ex = PANEL->ipiv + jb; - int* upiv = PANEL->IWORK + jb; // scratch space - - for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift - HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv); - - int* dipiv = PANEL->dipiv; - int* dipiv_ex = PANEL->dipiv + jb; - - HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), - upiv, jb * sizeof(int), - jb * sizeof(int), 1, - hipMemcpyHostToDevice, dataStream)); - HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv_ex, jb * sizeof(int), - ipiv_ex, jb * sizeof(int), - jb * sizeof(int), 1, - hipMemcpyHostToDevice, dataStream)); - } - else { - int k; - int *iflag, *ipl, *ipID, *ipA, *iplen, *ipmap, *ipmapm1, *upiv, *iwork, - *lindxU, *lindxA, *lindxAU, *permU, *permU_ex, *ipiv, - *dlindxU, *dlindxA, *dlindxAU, *dpermU, *dpermU_ex, *dipiv; - - k = (int)((unsigned int)(jb) << 1); - iflag = PANEL->IWORK; ipl = iflag + 1; ipID = ipl + 1; ipA = ipID + ((unsigned int)(k) << 1); - iplen = ipA + 1; ipmap = iplen + PANEL->grid->nprow + 1; ipmapm1 = ipmap + PANEL->grid->nprow; - upiv = ipmapm1 + PANEL->grid->nprow; iwork = upiv + PANEL->mp; - - lindxU = PANEL->lindxU; lindxA = PANEL->lindxA; lindxAU = PANEL->lindxAU; - permU = PANEL->permU; permU_ex = permU + jb; ipiv = PANEL->ipiv; - dlindxU = PANEL->dlindxU; dlindxA = PANEL->dlindxA; dlindxAU = PANEL->dlindxAU; - dpermU = PANEL->dpermU; dpermU_ex = dpermU + jb; dipiv = PANEL->dipiv; - - if(*iflag == -1) { /* no index arrays have been computed so far */ - HPL_pipid(PANEL, ipl, ipID); - HPL_plindx(PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork); - *iflag = 1; - } - - int N = Mmax(*ipA, jb); - if(N > 0) { - HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxA, k * sizeof(int), lindxA, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); - HIP_CHECK_ERROR(hipMemcpy2DAsync(dlindxAU, k * sizeof(int), lindxAU, k * sizeof(int), N * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); - } - - HIP_CHECK_ERROR(hipMemcpyAsync(dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream)); - - HIP_CHECK_ERROR(hipMemcpy2DAsync(dpermU, jb * sizeof(int), permU, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); - HIP_CHECK_ERROR(hipMemcpy2DAsync(dipiv, jb * sizeof(int), ipiv, jb * sizeof(int), jb * sizeof(int), 1, hipMemcpyHostToDevice, dataStream)); - } - } - - //record when the swap data will arrive - HIP_CHECK_ERROR(hipEventRecord(swapDataTransfer, dataStream)); - - // copy A and/or L2 - if(PANEL->grid->mycol == PANEL->pcol) { - // copy L1 - HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->dL1, jb * sizeof(double), - PANEL->L1, jb * sizeof(double), - jb * sizeof(double), jb, - hipMemcpyHostToDevice, dataStream)); - - //record when L1 will arrive - HIP_CHECK_ERROR(hipEventRecord(L1Transfer, dataStream)); + //record when L1 will arrive + HIP_CHECK_ERROR(hipEventRecord(L1Transfer, dataStream)); if(PANEL->grid->npcol > 1) { // L2 is its own array if(PANEL->grid->myrow == PANEL->prow) { @@ -723,16 +335,6 @@ int HIP::panel_free(HPL_T_panel *PANEL) return HPL_SUCCESS; } -int HIP::panel_disp(HPL_T_panel **PANEL) -{ - GPUInfo("%-40s \t%-5s", "[Deallocate]", "Panel structure", "HIP"); - (*PANEL)->free_work_now = 1; - int err = HIP::panel_free(*PANEL); - if (*PANEL) free(*PANEL); - *PANEL = NULL; - return( err ); -} - void HIP::gPrintMat(const int M, const int N, const int LDA, const double *A) { // Last row is the vector b @@ -751,7 +353,7 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_p int mycol, myrow, npcol, nprow, nq, info[3]; (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); - mat->n = N; mat->nb = NB; mat->info = 0; + mat->n = N; mat->nb = NB; mat->info = 0; mat->dN = N; mat->mp = HPL_numroc(N, NB, NB, myrow, 0, nprow); nq = HPL_numroc(N, NB, NB, mycol, 0, npcol); /* @@ -764,16 +366,8 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_p * Ensure that lda is a multiple of ALIGN and not a power of 2, and not * a multiple of 4096 bytes */ - mat->ld = ((Mmax(1, mat->mp) - 1) / ALGO->align) * ALGO->align; - do { - ii = (mat->ld += ALGO->align); - ip2 = 1; - while(ii > 1) { - ii >>= 1; - ip2 <<= 1; - } - im4096 = (mat->ld % 512 ) ? 0 : 1; - } while((mat->ld == ip2) || im4096); + mat->ld = Mmax(1, mat->mp); + mat->ld = ((mat->ld + 95) / 128) * 128 + 32; /*pad*/ mat->nq = nq + 1; @@ -861,8 +455,8 @@ int HIP::pdmatgen(HPL_T_test* TEST, HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_p workspace_size = Mmax((2 * Anp + nq) * sizeof(double), workspace_size); /*Scratch space for rows in pdlaswp (with extra space for padding) */ - dworkspace_size = Mmax((nq+256) * mat->nb * sizeof(double), dworkspace_size); - workspace_size = Mmax((nq+256) * mat->nb * sizeof(double), workspace_size); + dworkspace_size = Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), dworkspace_size); + workspace_size = Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), workspace_size); #ifdef HPL_VERBOSE_PRINT if((myrow == 0) && (mycol == 0)) { @@ -919,6 +513,18 @@ void HIP::pdmatfree(HPL_T_pmat* mat) { } +int HIP::panel_disp(HPL_T_panel **PANEL) +{ + GPUInfo("%-40s \t%-5s", "[Deallocate]", "Panel structure", "HIP"); + (*PANEL)->pmat->n = (*PANEL)->pmat->dN; + (*PANEL)->free_work_now = 1; + int err = HIP::panel_free(*PANEL); + if (*PANEL) free(*PANEL); + *PANEL = NULL; + return( err ); +} + + void HIP::matgen(const HPL_T_grid *GRID, const int M, const int N, const int NB, double *A, const int LDA, const int ISEED) @@ -1278,47 +884,176 @@ void HIP::move_data(double *DST, const double *SRC, const size_t SIZE, const int HIP_CHECK_ERROR(hipMemcpy(DST, SRC, SIZE, (hipMemcpyKind)KIND)); } -void HIP::move_data_2d(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, const int KIND) -{ - char title[25] = "[MOVE2D_"; strcat(title,_memcpyKind[KIND]); strcat(title,"]"); - GPUInfo("%-25s %-12ld (B) \t%-5s", title, "Memory of size", SIZE, "HIP"); - HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, src, spitch, width, height, (hipMemcpyKind)KIND)); -} +void HIP::move_data_2d(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, const int KIND) +{ + char title[25] = "[MOVE2D_"; strcat(title,_memcpyKind[KIND]); strcat(title,"]"); + GPUInfo("%-25s %-12ld (B) \t%-5s", title, "Memory of size", SIZE, "HIP"); + HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, src, spitch, width, height, (hipMemcpyKind)KIND)); +} + + +void HIP::device_sync() { + HIP_CHECK_ERROR(hipDeviceSynchronize()); +} + +int HIP::bcast_ibcst(HPL_T_panel* PANEL, int* IFLAG) { + + double *L2ptr; +#ifdef ROCM + L2ptr = PANEL->dL2; +#else + L2ptr = PANEL->L2; +#endif + + if(PANEL == NULL) { + return HPL_SUCCESS; + } + if(PANEL->grid->npcol <= 1) { + return HPL_SUCCESS; + } + + MPI_Comm comm = PANEL->grid->row_comm; + int root = PANEL->pcol; + + if(PANEL->len <= 0) return HPL_SUCCESS; + int ierr = MPI_Ibcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm, &bcast_req); + *IFLAG = ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); + return *IFLAG; +} + +int HIP::bwait_ibcast(HPL_T_panel* PANEL) { + int ierr; + ierr = MPI_Wait(&bcast_req, MPI_STATUS_IGNORE); + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} + +void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, const HPL_T_UPD UPD, const SWP_PHASE phase) { + double *U, *W; + double *dA, *dU, *dW; + int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; + int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; + int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; + MPI_Comm comm; + + /* + * Retrieve parameters from the PANEL data structure + */ + n = PANEL->n; jb = PANEL->jb; + nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow; + comm = PANEL->grid->col_comm; icurrow = PANEL->prow; + iflag = PANEL->IWORK; + dA = PANEL->dA; lda = PANEL->dlda; + PANEL->pmat->n = PANEL->dldl1; + + // Quick return if we're 1xQ + if(phase != SWP_END && nprow == 1) return; + + pdlaswp_set_var(PANEL, dU, U, LDU, dW, W, LDW, n, dA, UPD); + + /* Quick return if there is nothing to do */ + if((n <= 0) || (jb <= 0)) return; + + // Quick swapping if P==1 + if (phase == SWP_END && nprow == 1) { + // wait for swapping data to arrive + HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, SWAPDATATRANSFER, HPL_TR); + HIP::HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv); + return; + } -void HIP::device_sync() { - HIP_CHECK_ERROR(hipDeviceSynchronize()); -} + /* + * Compute ipID (if not already done for this panel). lindxA and lindxAU + * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 + * are of size nprow, permU is of length jb, and this function needs a + * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): + * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) + * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1); + */ + k = (int)((unsigned int)(jb) << 1); + ipl = iflag + 1; + ipID = ipl + 1; + ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; + ipcounts = iplen + nprow + 1; + ipoffsets = ipcounts + nprow; + iwork = ipoffsets + nprow; -int HIP::bcast_ibcst(HPL_T_panel* PANEL, int* IFLAG) { + if (phase == SWP_START) { + if(*iflag == -1) {/* no index arrays have been computed so far */ + // get the ipivs on the host after the Bcast + if(PANEL->grid->mycol != PANEL->pcol) { + HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->ipiv, PANEL->jb * sizeof(int), + PANEL->dipiv, PANEL->jb * sizeof(int), + PANEL->jb * sizeof(int), 1, + hipMemcpyDeviceToHost, HIP::dataStream)); + } + HPL_BE_stream_synchronize(HPL_DATASTREAM, HPL_TR); - double *L2ptr; -#ifdef ROCM - L2ptr = PANEL->dL2; -#else - L2ptr = PANEL->L2; -#endif + // compute spreading info + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx(PANEL, *ipl, ipID, ipA, PANEL->lindxU, PANEL->lindxAU, PANEL->lindxA, iplen, PANEL->permU, iwork); + *iflag = 1; + } - if(PANEL == NULL) { - return HPL_SUCCESS; - } - if(PANEL->grid->npcol <= 1) { - return HPL_SUCCESS; + /* + * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- + * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack + * into U, otherwise we pack into workspace. The first + * entry of each column packed in workspace is in fact the row or column + * offset in U where it should go to. + */ + if(myrow == icurrow) { + // copy needed rows of A into U + HIP::HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, PANEL->dlindxU); + // record the evernt when packing completes + HIP::event_record(SWAPSTART, UPD); + } else { + // copy needed rows from A into U(:, iplen[myrow]) + HIP::HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, Mptr(dU, 0, iplen[myrow], LDU), LDU, PANEL->dlindxU); + // record the event when packing completes + HIP::event_record(SWAPSTART, UPD); + } } + else if (phase == SWP_COMM) { + /* Set MPI message counts and offsets */ + ipcounts[0] = (iplen[1] - iplen[0]) * LDU; + ipoffsets[0] = 0; + PANEL->pmat->n = PANEL->dldl1; + // if (phase == SWP_END) + // PANEL->pmat->n = PANEL->pmat->dN; + for(int i = 1; i < nprow; ++i) { + ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU; + ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1]; + } - MPI_Comm comm = PANEL->grid->row_comm; - int root = PANEL->pcol; - - if(PANEL->len <= 0) return HPL_SUCCESS; - int ierr = MPI_Ibcast(L2ptr, PANEL->len, MPI_DOUBLE, root, comm, &bcast_req); - *IFLAG = ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); - return *IFLAG; -} + if(myrow == icurrow) { + HIP::event_synchronize(SWAPSTART, UPD); + // Send rows info to other ranks + HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm); + // All gather dU (gather + broadcast) + HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm); + } else { + // Wait for dU to be ready + HIP::event_synchronize(SWAPSTART, UPD); + // Receive rows from icurrow into dW + HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm); + // All gather dU + HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm); + } + } + else if (phase == SWP_END) { + if(myrow == icurrow) { + // Swap rows local to A on device + HIP::HPL_dlaswp02T(*ipA, n, dA, lda, PANEL->dlindxAU, PANEL->dlindxA); + } else { + // Queue inserting recieved rows in W into A on device + HIP::HPL_dlaswp04T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, PANEL->dlindxU); + } + /* Permute U in every process row */ + HIP::HPL_dlaswp10N(n, jb, dU, LDU, PANEL->dpermU); -int HIP::bwait_ibcast(HPL_T_panel* PANEL) { - int ierr; - ierr = MPI_Wait(&bcast_req, MPI_STATUS_IGNORE); - return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); + } } #define BLOCK_SIZE_PDLANGE 512 @@ -1558,409 +1293,681 @@ __global__ void _dlaswp00N(const int N, const int M, double* __restrict__ A, con } } -// Row swapping for P==1 -void HIP::HPL_dlaswp00N(const int M, const int N, double* A, const int LDA, const int* IPIV) { +// Row swapping for P==1 +void HIP::HPL_dlaswp00N(const int M, const int N, double* A, const int LDA, const int* IPIV) { + + if((M <= 0) || (N <= 0)) return; + + hipStream_t stream; + ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); + int grid_size = N; + hipLaunchKernelGGL((_dlaswp00N), dim3(grid_size), dim3(BLOCK_SIZE_00N), 0, stream, N, M, A, LDA, IPIV); +} + + +#define TILE_DIM_01T 32 +#define BLOCK_ROWS_01T 8 + +/* Build U matrix from rows of A */ +__global__ void _dlaswp01T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ U, const int LDU, const int* __restrict__ LINDXU) { + + __shared__ double s_U[TILE_DIM_01T][TILE_DIM_01T + 1]; + + const int m = threadIdx.x + TILE_DIM_01T * blockIdx.x; + const int n = threadIdx.y + TILE_DIM_01T * blockIdx.y; + + if(m < M) { + const int ipa = LINDXU[m]; + + // Save to LDS to reduce global memory operation + s_U[threadIdx.x][threadIdx.y + 0] = + (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; + s_U[threadIdx.x][threadIdx.y + 8] = + (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; + s_U[threadIdx.x][threadIdx.y + 16] = + (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; + s_U[threadIdx.x][threadIdx.y + 24] = + (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; + } + + __syncthreads(); + + const int um = threadIdx.y + TILE_DIM_01T * blockIdx.x; + const int un = threadIdx.x + TILE_DIM_01T * blockIdx.y; + + if(un < N) { + // write out chunks of U + if((um + 0) < M) + U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x]; + if((um + 8) < M) + U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x]; + if((um + 16) < M) + U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x]; + if((um + 24) < M) + U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x]; + } +} + +void HIP::HPL_dlaswp01T(const int M, const int N, double* A, const int LDA, double* U, const int LDU, const int* LINDXU) { + + if((M <= 0) || (N <= 0)) return; + + hipStream_t stream; + ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); + dim3 grid_size((M + TILE_DIM_01T - 1) / TILE_DIM_01T, (N + TILE_DIM_01T - 1) / TILE_DIM_01T); + dim3 block_size(TILE_DIM_01T, BLOCK_ROWS_01T); + hipLaunchKernelGGL((_dlaswp01T), grid_size, block_size, 0, stream, M, N, A, LDA, U, LDU, LINDXU); +} + +/* Perform any local row swaps of A */ +__global__ void _dlaswp02T(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ LINDXAU, const int* __restrict__ LINDXA) { + + const int n = blockIdx.x, m = threadIdx.x; + + const int srow = LINDXAU[m]; //src row + const int drow = LINDXA[m]; //dst row + + const double An = A[srow + n * ((size_t)LDA)]; + + __syncthreads(); + + A[drow + n * ((size_t)LDA)] = An; +} + +void HIP::HPL_dlaswp02T(const int M, const int N, double* A, const int LDA, const int* LINDXAU, const int* LINDXA) { + + if((M <= 0) || (N <= 0)) return; + + hipStream_t stream; + ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); + dim3 grid_size(N), block_size(M); + hipLaunchKernelGGL((_dlaswp02T), N, M, 0, stream, M, N, A, LDA, LINDXAU, LINDXA); +} + +#define TILE_DIM_03T 32 +#define BLOCK_ROWS_03T 8 + +/* Build W matrix from rows of A */ +__global__ void _dlaswp03T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) { + + __shared__ double s_W[TILE_DIM_03T][TILE_DIM_03T + 1]; + + const int m = threadIdx.x + TILE_DIM_03T * blockIdx.x; + const int n = threadIdx.y + TILE_DIM_03T * blockIdx.y; + + if(m < M) { + const int ipa = LINDXU[m]; + + // Save to LDS to reduce global memory operation + s_W[threadIdx.x][threadIdx.y + 0] = + (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 8] = + (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 16] = + (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 24] = + (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; + } + + __syncthreads(); + + const int wm = threadIdx.y + TILE_DIM_03T * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM_03T * blockIdx.y; + + if(wn < N) { + // write out chunks of W + if((wm + 0) < M) + W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x]; + if((wm + 8) < M) + W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x]; + if((wm + 16) < M) + W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x]; + if((wm + 24) < M) + W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x]; + } +} + +void HIP::HPL_dlaswp03T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) { if((M <= 0) || (N <= 0)) return; - hipStream_t stream; ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); - int grid_size = N; - hipLaunchKernelGGL((_dlaswp00N), dim3(grid_size), dim3(BLOCK_SIZE_00N), 0, stream, N, M, A, LDA, IPIV); + dim3 grid_size((M + TILE_DIM_03T - 1) / TILE_DIM_03T, (N + TILE_DIM_03T - 1) / TILE_DIM_03T); + dim3 block_size(TILE_DIM_03T, BLOCK_ROWS_03T); + hipLaunchKernelGGL((_dlaswp03T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU); } +#define TILE_DIM_04T 32 +#define BLOCK_ROWS_04T 8 -#define TILE_DIM_01T 32 -#define BLOCK_ROWS_01T 8 - -/* Build U matrix from rows of A */ -__global__ void _dlaswp01T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ U, const int LDU, const int* __restrict__ LINDXU) { +static __global__ void _dlaswp04T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) { - __shared__ double s_U[TILE_DIM_01T][TILE_DIM_01T + 1]; + __shared__ double s_W[TILE_DIM_04T][TILE_DIM_04T + 1]; - const int m = threadIdx.x + TILE_DIM_01T * blockIdx.x; - const int n = threadIdx.y + TILE_DIM_01T * blockIdx.y; + const int am = threadIdx.x + TILE_DIM_04T * blockIdx.x; + const int an = threadIdx.y + TILE_DIM_04T * blockIdx.y; - if(m < M) { - const int ipa = LINDXU[m]; + const int wm = threadIdx.y + TILE_DIM_04T * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM_04T * blockIdx.y; - // Save to LDS to reduce global memory operation - s_U[threadIdx.x][threadIdx.y + 0] = - (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; - s_U[threadIdx.x][threadIdx.y + 8] = - (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; - s_U[threadIdx.x][threadIdx.y + 16] = - (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; - s_U[threadIdx.x][threadIdx.y + 24] = - (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; + if(wn < N) { + s_W[threadIdx.y + 0][threadIdx.x] = + (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 8][threadIdx.x] = + (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 16][threadIdx.x] = + (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 24][threadIdx.x] = + (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0; } __syncthreads(); - const int um = threadIdx.y + TILE_DIM_01T * blockIdx.x; - const int un = threadIdx.x + TILE_DIM_01T * blockIdx.y; - - if(un < N) { - // write out chunks of U - if((um + 0) < M) - U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x]; - if((um + 8) < M) - U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x]; - if((um + 16) < M) - U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x]; - if((um + 24) < M) - U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x]; + if(am < M) { + const int aip = LINDXU[am]; + if((an + 0) < N) + A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0]; + if((an + 8) < N) + A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8]; + if((an + 16) < N) + A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16]; + if((an + 24) < N) + A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24]; } } -void HIP::HPL_dlaswp01T(const int M, const int N, double* A, const int LDA, double* U, const int LDU, const int* LINDXU) { +void HIP::HPL_dlaswp04T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) { + if((M <= 0) || (N <= 0)) return; + hipStream_t stream; + ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); + dim3 grid_size((M + TILE_DIM_04T - 1) / TILE_DIM_04T, (N + TILE_DIM_04T - 1) / TILE_DIM_04T); + dim3 block_size(TILE_DIM_04T, BLOCK_ROWS_04T); + hipLaunchKernelGGL((_dlaswp04T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU); +} + +__global__ void _dlaswp10N(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ IPIV) { + + const int m = threadIdx.x + blockDim.x * blockIdx.x; + + if (m < M) { + for (int i = 0; i < N; i++) { + const int ip = IPIV[i]; + if (ip != i) { + // swap rows + const double Ai = A[m + i * ((size_t)LDA)]; + const double Aip = A[m + ip * ((size_t)LDA)]; + A[m + i * ((size_t)LDA)] = Aip; + A[m + ip * ((size_t)LDA)] = Ai; + } + } + } +} +void HIP::HPL_dlaswp10N(const int M, const int N, double* A, const int LDA, const int* IPIV) { if((M <= 0) || (N <= 0)) return; hipStream_t stream; ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); - dim3 grid_size((M + TILE_DIM_01T - 1) / TILE_DIM_01T, (N + TILE_DIM_01T - 1) / TILE_DIM_01T); - dim3 block_size(TILE_DIM_01T, BLOCK_ROWS_01T); - hipLaunchKernelGGL((_dlaswp01T), grid_size, block_size, 0, stream, M, N, A, LDA, U, LDU, LINDXU); + + const int block_size_10N = 512; + + dim3 grid_size((M + block_size_10N - 1) / block_size_10N); + hipLaunchKernelGGL((_dlaswp10N), grid_size, dim3(block_size_10N), 0, stream, M, N, A, LDA, IPIV); } -/* Perform any local row swaps of A */ -__global__ void _dlaswp02T(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ LINDXAU, const int* __restrict__ LINDXA) { +__global__ void setZero(const int N, double* __restrict__ X) { + const int t = threadIdx.x, b = blockIdx.x; + const size_t id = b * blockDim.x + t; // row id - const int n = blockIdx.x, m = threadIdx.x; + if(id < N) + X[id] = 0.0; +} - const int srow = LINDXAU[m]; //src row - const int drow = LINDXA[m]; //dst row +void HIP::HPL_set_zero(const int N, double* __restrict__ X) { + const int block_size = 512; + hipLaunchKernelGGL((setZero), dim3((N + block_size - 1) / block_size), dim3(block_size), 0, HIP::computeStream, N, X); +} - const double An = A[srow + n * ((size_t)LDA)]; - __syncthreads(); - A[drow + n * ((size_t)LDA)] = An; +// Setting the matrix section and phase of pdupdate +void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, int icurcol, std::list op_list) { + HPL_T_UPD UPD; + SWP_PHASE phase; + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + const PDLASWP_OP op = *it; + if (op == SU0 || op == SU1 || op == SU2) phase = SWP_START; + else if (op == CU0 || op == CU1 || op == CU2) phase = SWP_COMM; + else if (op == EU0 || op == EU1 || op == EU2) phase = SWP_END; + else phase = SWP_NO; + + if (op == SU0 || op == CU0 || op == EU0) UPD = HPL_LOOK_AHEAD; + else if (op == SU1 || op == CU1 || op == EU1) UPD = HPL_UPD_1; + else if (op == SU2 || op == CU2 || op == EU2) UPD = HPL_UPD_2; + else UPD = HPL_N_UPD; + + if (UPD == HPL_LOOK_AHEAD && PANEL->grid->mycol != icurcol) + continue; + else + HPL_pdlaswp_hip(PANEL, UPD, phase); + } } -void HIP::HPL_dlaswp02T(const int M, const int N, double* A, const int LDA, const int* LINDXAU, const int* LINDXA) { +void HIP::pdlaswp_set_var(HPL_T_panel* PANEL, double* &dU, double* &U, int &ldu, double* &dW, double* &W, int &ldw, int &n, double* &dA, const HPL_T_UPD UPD) { + switch (UPD) { + case HPL_LOOK_AHEAD: + dU = PANEL->dU; U = PANEL->U; ldu = PANEL->ldu0; + dW = PANEL->dW; W = PANEL->W; ldw = PANEL->ldu0; + n = PANEL->nu0; + break; + case HPL_UPD_1: + dU = PANEL->dU1; U = PANEL->U1; ldu = PANEL->ldu1; + dW = PANEL->dW1; W = PANEL->W1; ldw = PANEL->ldu1; + n = PANEL->nu1; + dA = Mptr(dA, 0, PANEL->nu0, PANEL->dlda); + break; + case HPL_UPD_2: + dU = PANEL->dU2; U = PANEL->U2; ldu = PANEL->ldu2; + dW = PANEL->dW2; W = PANEL->W2; ldw = PANEL->ldu2; + n = PANEL->nu2; + dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, PANEL->dlda); + break; + default: + break; + } +} - if((M <= 0) || (N <= 0)) return; +void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int N, const int JB, + HPL_T_pmat *A, const int IA, const int JA, const int TAG, HPL_T_panel *PANEL) +{ + size_t dalign; + int icurcol, icurrow, ii, itmp1, jj, lwork, + ml2, mp, mycol, myrow, nb, npcol, nprow, + nq, nu, ldu; + /* .. + * .. Executable Statements .. + */ + PANEL->grid = GRID; /* ptr to the process grid */ + PANEL->algo = ALGO; /* ptr to the algo parameters */ + PANEL->pmat = A; /* ptr to the local array info */ - hipStream_t stream; - ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); - dim3 grid_size(N), block_size(M); - hipLaunchKernelGGL((_dlaswp02T), N, M, 0, stream, M, N, A, LDA, LINDXAU, LINDXA); -} + myrow = GRID->myrow; + mycol = GRID->mycol; + nprow = GRID->nprow; + npcol = GRID->npcol; + nb = A->nb; + + HPL_infog2l(IA, JA, nb, nb, nb, nb, 0, 0, myrow, mycol, + nprow, npcol, &ii, &jj, &icurrow, &icurcol); + mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow); + nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol); + + const int inxtcol = MModAdd1(icurcol, npcol); + const int inxtrow = MModAdd1(icurrow, nprow); + + /* ptr to trailing part of A */ + PANEL->A = A->A; + PANEL->dA = Mptr((double *)(A->d_A), ii, jj, A->ld); + + /* + * Workspace pointers are initialized to NULL. + */ + PANEL->L2 = nullptr; + PANEL->dL2 = nullptr; + PANEL->L1 = nullptr; + PANEL->dL1 = nullptr; + PANEL->DINFO = nullptr; + PANEL->U = nullptr; + PANEL->dU = nullptr; + PANEL->W = nullptr; + PANEL->dW = nullptr; + PANEL->U1 = nullptr; + PANEL->dU1 = nullptr; + PANEL->W1 = nullptr; + PANEL->dW1 = nullptr; + PANEL->U2 = nullptr; + PANEL->dU2 = nullptr; + PANEL->W2 = nullptr; + PANEL->dW2 = nullptr; + /* + * Local lengths, indexes process coordinates + */ + PANEL->nb = nb; /* distribution blocking factor */ + PANEL->jb = JB; /* panel width */ + PANEL->m = M; /* global # of rows of trailing part of A */ + PANEL->n = N; /* global # of cols of trailing part of A */ + PANEL->ia = IA; /* global row index of trailing part of A */ + PANEL->ja = JA; /* global col index of trailing part of A */ + PANEL->mp = mp; /* local # of rows of trailing part of A */ + PANEL->nq = nq; /* local # of cols of trailing part of A */ + PANEL->ii = ii; /* local row index of trailing part of A */ + PANEL->jj = jj; /* local col index of trailing part of A */ + PANEL->lda = Mmax(1, mp); /* local leading dim of array A */ + PANEL->dlda = A->ld; /* local leading dim of array A */ + PANEL->prow = icurrow; /* proc row owning 1st row of trailing A */ + PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ + PANEL->msgid = TAG; /* message id to be used for panel bcast */ + /* + * Initialize ldl2 and len to temporary dummy values and Update tag for + * next panel + */ + PANEL->ldl2 = 0; /* local leading dim of array L2 */ + PANEL->dldl2 = 0; /* local leading dim of array L2 */ + PANEL->dldl1 = 1.02 * A->dN; // padding + PANEL->len = 0; /* length of the buffer to broadcast */ + PANEL->nu0 = 0; + PANEL->nu1 = 0; + PANEL->nu2 = 0; + PANEL->ldu0 = 0; + PANEL->ldu1 = 0; + PANEL->ldu2 = 0; -#define TILE_DIM_03T 32 -#define BLOCK_ROWS_03T 8 + + /*Split fraction*/ + const double fraction = 0.6; -/* Build W matrix from rows of A */ -__global__ void _dlaswp03T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) { + if ((double)M / A->dN > 0.97) { + HPL_ptimer_boot(); + HPL_ptimer( 0 ); + } + dalign = ALGO->align * sizeof(double); + size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double)); - __shared__ double s_W[TILE_DIM_03T][TILE_DIM_03T + 1]; + if (npcol == 1) /* P x 1 process grid */ + { /* space for L1, DPIV, DINFO */ + lwork = ALGO->align + (PANEL->len = JB * JB + lpiv) + 1; + nu = Mmax(0, nq - JB); + ldu = nu + 256; /*extra space for padding*/ + lwork += JB * ldu; - const int m = threadIdx.x + TILE_DIM_03T * blockIdx.x; - const int n = threadIdx.y + TILE_DIM_03T * blockIdx.y; + if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double)) + { + if (PANEL->WORK) + { + HIP_CHECK_ERROR(hipFree(PANEL->dWORK)); + HIP_CHECK_ERROR(hipHostFree(PANEL->WORK)); + } + size_t numbytes = (size_t)(lwork) * sizeof(double); - if(m < M) { - const int ipa = LINDXU[m]; + if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS || + hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS) + { + HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed"); + } + PANEL->max_work_size = (size_t)(lwork) * sizeof(double); - // Save to LDS to reduce global memory operation - s_W[threadIdx.x][threadIdx.y + 0] = - (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; - s_W[threadIdx.x][threadIdx.y + 8] = - (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; - s_W[threadIdx.x][threadIdx.y + 16] = - (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; - s_W[threadIdx.x][threadIdx.y + 24] = - (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; - } +#ifdef HPL_VERBOSE_PRINT + if ((myrow == 0) && (mycol == 0)) + { + printf("Allocating %g GBs of storage on CPU...", + ((double)numbytes) / (1024 * 1024 * 1024)); + fflush(stdout); - __syncthreads(); + printf("done.\n"); + printf("Allocating %g GBs of storage on GPU...", + ((double)numbytes) / (1024 * 1024 * 1024)); + fflush(stdout); + printf("done.\n"); + } +#endif + } + /* + * Initialize the pointers of the panel structure - Always re-use A in + * the only process column + */ + PANEL->ldl2 = Mmax(1, mp); + PANEL->dldl2 = A->ld; + PANEL->dL2 = PANEL->dA + (myrow == icurrow ? JB : 0); + PANEL->L2 = PANEL->A + (myrow == icurrow ? JB : 0); + PANEL->U = (double *)PANEL->WORK; + PANEL->dU = (double *)PANEL->dWORK; + PANEL->L1 = (double *)PANEL->WORK + (JB * Mmax(0, ldu)); + PANEL->dL1 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu)); + PANEL->W = A->W; + PANEL->dW = A->dW; - const int wm = threadIdx.y + TILE_DIM_03T * blockIdx.x; - const int wn = threadIdx.x + TILE_DIM_03T * blockIdx.y; + if (nprow == 1) + { + PANEL->nu0 = Mmin(JB, nu); + PANEL->ldu0 = PANEL->nu0; - if(wn < N) { - // write out chunks of W - if((wm + 0) < M) - W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x]; - if((wm + 8) < M) - W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x]; - if((wm + 16) < M) - W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x]; - if((wm + 24) < M) - W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x]; - } -} + PANEL->nu1 = 0; + PANEL->ldu1 = 0; -void HIP::HPL_dlaswp03T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) { + PANEL->nu2 = nu - PANEL->nu0; + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ - if((M <= 0) || (N <= 0)) return; - hipStream_t stream; - ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); - dim3 grid_size((M + TILE_DIM_03T - 1) / TILE_DIM_03T, (N + TILE_DIM_03T - 1) / TILE_DIM_03T); - dim3 block_size(TILE_DIM_03T, BLOCK_ROWS_03T); - hipLaunchKernelGGL((_dlaswp03T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU); -} + PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; + PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; + PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; + PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; -#define TILE_DIM_04T 32 -#define BLOCK_ROWS_04T 8 + PANEL->permU = (int *)(PANEL->L1 + JB * JB); + PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB); + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; -static __global__ void _dlaswp04T(const int M, const int N, double* __restrict__ A, const int LDA, double* __restrict__ W, const int LDW, const int* __restrict__ LINDXU) { + PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB); + PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB); + } + else + { + const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); + PANEL->nu0 = Mmin(JB, nu); + PANEL->ldu0 = PANEL->nu0; - __shared__ double s_W[TILE_DIM_04T][TILE_DIM_04T + 1]; + PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ - const int am = threadIdx.x + TILE_DIM_04T * blockIdx.x; - const int an = threadIdx.y + TILE_DIM_04T * blockIdx.y; + PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; + PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ - const int wm = threadIdx.y + TILE_DIM_04T * blockIdx.x; - const int wn = threadIdx.x + TILE_DIM_04T * blockIdx.y; + PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; + PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; + PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; + PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - if(wn < N) { - s_W[threadIdx.y + 0][threadIdx.x] = - (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0; - s_W[threadIdx.y + 8][threadIdx.x] = - (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0; - s_W[threadIdx.y + 16][threadIdx.x] = - (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0; - s_W[threadIdx.y + 24][threadIdx.x] = - (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0; - } + PANEL->W1 = PANEL->W + PANEL->ldu0 * JB; + PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB; + PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB; + PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB; - __syncthreads(); + PANEL->lindxA = (int *)(PANEL->L1 + JB * JB); + PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB); + PANEL->lindxAU = PANEL->lindxA + JB; + PANEL->dlindxAU = PANEL->dlindxA + JB; + PANEL->lindxU = PANEL->lindxAU + JB; + PANEL->dlindxU = PANEL->dlindxAU + JB; + PANEL->permU = PANEL->lindxU + JB; + PANEL->dpermU = PANEL->dlindxU + JB; - if(am < M) { - const int aip = LINDXU[am]; - if((an + 0) < N) - A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0]; - if((an + 8) < N) - A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8]; - if((an + 16) < N) - A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16]; - if((an + 24) < N) - A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24]; + // Put ipiv array at the end + PANEL->dipiv = PANEL->dpermU + JB; + PANEL->ipiv = PANEL->permU + JB; + + PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv; + PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv; + } + + *(PANEL->DINFO) = 0.0; } -} + else // for ncol != 1 + { /* space for L2, L1, DPIV */ + ml2 = (myrow == icurrow ? mp - JB : mp); + ml2 = Mmax(0, ml2); + ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/ + itmp1 = JB * JB + lpiv; // L1, integer arrays + PANEL->len = ml2 * JB + itmp1; -void HIP::HPL_dlaswp04T(const int M, const int N, double* A, const int LDA, double* W, const int LDW, const int* LINDXU) { - if((M <= 0) || (N <= 0)) return; - hipStream_t stream; - ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); - dim3 grid_size((M + TILE_DIM_04T - 1) / TILE_DIM_04T, (N + TILE_DIM_04T - 1) / TILE_DIM_04T); - dim3 block_size(TILE_DIM_04T, BLOCK_ROWS_04T); - hipLaunchKernelGGL((_dlaswp04T), grid_size, block_size, 0, stream, M, N, A, LDA, W, LDW, LINDXU); -} + lwork = ALGO->align + PANEL->len + 1; -__global__ void _dlaswp10N(const int M, const int N, double* __restrict__ A, const int LDA, const int* __restrict__ IPIV) { + nu = Mmax(0, (mycol == icurcol ? nq - JB : nq)); + ldu = nu + 256; /*extra space for potential padding*/ - const int m = threadIdx.x + blockDim.x * blockIdx.x; + // if( nprow > 1 ) /* space for U */ + { + lwork += JB * ldu; + } + if (PANEL->max_work_size < (size_t)(lwork) * sizeof(double)) + { + if (PANEL->WORK) + { + HIP_CHECK_ERROR(hipFree(PANEL->dWORK)); + HIP_CHECK_ERROR(hipHostFree(PANEL->WORK)); + } + size_t numbytes = (size_t)(lwork) * sizeof(double); - if (m < M) { - for (int i = 0; i < N; i++) { - const int ip = IPIV[i]; - if (ip != i) { - // swap rows - const double Ai = A[m + i * ((size_t)LDA)]; - const double Aip = A[m + ip * ((size_t)LDA)]; - A[m + i * ((size_t)LDA)] = Aip; - A[m + ip * ((size_t)LDA)] = Ai; + if (hipMalloc((void **)&(PANEL->dWORK), numbytes) != HIP_SUCCESS || + hipHostMalloc((void **)&(PANEL->WORK), numbytes, hipHostMallocDefault) != HIP_SUCCESS) + { + HPL_pabort(__LINE__, "HPL_pdpanel_init", "Memory allocation failed"); + } + PANEL->max_work_size = (size_t)(lwork) * sizeof(double); +#ifdef HPL_VERBOSE_PRINT + if ((myrow == 0) && (mycol == 0)) + { + printf("Allocating %g GBs of storage on CPU...", + ((double)numbytes) / (1024 * 1024 * 1024)); + fflush(stdout); + printf("done.\n"); + printf("Allocating %g GBs of storage on GPU...", + ((double)numbytes) / (1024 * 1024 * 1024)); + fflush(stdout); + printf("done.\n"); } +#endif } - } -} - -void HIP::HPL_dlaswp10N(const int M, const int N, double* A, const int LDA, const int* IPIV) { - if((M <= 0) || (N <= 0)) return; - - hipStream_t stream; - ROCBLAS_CHECK_STATUS(rocblas_get_stream(_handle, &stream)); + /* + * Initialize the pointers of the panel structure - Re-use A in the cur- + * rent process column when HPL_COPY_L is not defined. + */ + PANEL->U = (double *)PANEL->WORK; + PANEL->dU = (double *)PANEL->dWORK; - const int block_size_10N = 512; + PANEL->W = A->W; + PANEL->dW = A->dW; - dim3 grid_size((M + block_size_10N - 1) / block_size_10N); - hipLaunchKernelGGL((_dlaswp10N), grid_size, dim3(block_size_10N), 0, stream, M, N, A, LDA, IPIV); -} + PANEL->L2 = (double *)PANEL->WORK + (JB * Mmax(0, ldu)); + PANEL->dL2 = (double *)PANEL->dWORK + (JB * Mmax(0, ldu)); + PANEL->L1 = PANEL->L2 + ml2 * JB; + PANEL->dL1 = PANEL->dL2 + ml2 * JB; + PANEL->ldl2 = Mmax(1, ml2); + PANEL->dldl2 = Mmax(1, ml2); -__global__ void setZero(const int N, double* __restrict__ X) { - const int t = threadIdx.x, b = blockIdx.x; - const size_t id = b * blockDim.x + t; // row id + if (nprow == 1) + { + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; - if(id < N) - X[id] = 0.0; -} + PANEL->nu1 = 0; + PANEL->ldu1 = 0; -void HIP::HPL_set_zero(const int N, double* __restrict__ X) { - const int block_size = 512; - hipLaunchKernelGGL((setZero), dim3((N + block_size - 1) / block_size), dim3(block_size), 0, HIP::computeStream, N, X); -} + PANEL->nu2 = nu - PANEL->nu0; + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ -void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, const HPL_T_UPD UPD, const SWP_PHASE phase) { - double *U, *W; - double *dA, *dU, *dW; - int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; - int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; - int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; - MPI_Comm comm; + PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; + PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; + PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; + PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - /* - * Retrieve parameters from the PANEL data structure - */ - n = PANEL->n; jb = PANEL->jb; - nprow = PANEL->grid->nprow; myrow = PANEL->grid->myrow; - comm = PANEL->grid->col_comm; icurrow = PANEL->prow; - iflag = PANEL->IWORK; - dA = PANEL->dA; lda = PANEL->dlda; + PANEL->permU = (int *)(PANEL->L1 + JB * JB); + PANEL->dpermU = (int *)(PANEL->dL1 + JB * JB); + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; - // Quick return if we're 1xQ - if(phase != SWP_END && nprow == 1) return; + PANEL->DINFO = (double *)(PANEL->ipiv + 2 * JB); + PANEL->dDINFO = (double *)(PANEL->dipiv + 2 * JB); + } + else + { + const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; - pdlaswp_set_var(PANEL, dU, U, LDU, dW, W, LDW, n, dA, UPD); + PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ - /* Quick return if there is nothing to do */ - if((n <= 0) || (jb <= 0)) return; + PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; + PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ - // Quick swapping if P==1 - if (phase == SWP_END && nprow == 1) { - // wait for swapping data to arrive - HPL_BE_stream_wait_event(HPL_COMPUTESTREAM, SWAPDATATRANSFER, HPL_TR); + PANEL->U1 = PANEL->U + PANEL->ldu0 * JB; + PANEL->dU1 = PANEL->dU + PANEL->ldu0 * JB; + PANEL->U2 = PANEL->U1 + PANEL->ldu1 * JB; + PANEL->dU2 = PANEL->dU1 + PANEL->ldu1 * JB; - HIP::HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv); - return; - } + PANEL->W1 = PANEL->W + PANEL->ldu0 * JB; + PANEL->dW1 = PANEL->dW + PANEL->ldu0 * JB; + PANEL->W2 = PANEL->W1 + PANEL->ldu1 * JB; + PANEL->dW2 = PANEL->dW1 + PANEL->ldu1 * JB; - /* - * Compute ipID (if not already done for this panel). lindxA and lindxAU - * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 - * are of size nprow, permU is of length jb, and this function needs a - * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): - * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) - * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1); - */ - k = (int)((unsigned int)(jb) << 1); - ipl = iflag + 1; - ipID = ipl + 1; - ipA = ipID + ((unsigned int)(k) << 1); - iplen = ipA + 1; - ipcounts = iplen + nprow + 1; - ipoffsets = ipcounts + nprow; - iwork = ipoffsets + nprow; + PANEL->lindxA = (int *)(PANEL->L1 + JB * JB); + PANEL->dlindxA = (int *)(PANEL->dL1 + JB * JB); + PANEL->lindxAU = PANEL->lindxA + JB; + PANEL->dlindxAU = PANEL->dlindxA + JB; + PANEL->lindxU = PANEL->lindxAU + JB; + PANEL->dlindxU = PANEL->dlindxAU + JB; + PANEL->permU = PANEL->lindxU + JB; + PANEL->dpermU = PANEL->dlindxU + JB; - if (phase == SWP_START) { - if(*iflag == -1) {/* no index arrays have been computed so far */ - // get the ipivs on the host after the Bcast - if(PANEL->grid->mycol != PANEL->pcol) { - HIP_CHECK_ERROR(hipMemcpy2DAsync(PANEL->ipiv, PANEL->jb * sizeof(int), - PANEL->dipiv, PANEL->jb * sizeof(int), - PANEL->jb * sizeof(int), 1, - hipMemcpyDeviceToHost, HIP::dataStream)); - } - HPL_BE_stream_synchronize(HPL_DATASTREAM, HPL_TR); + // Put ipiv array at the end + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; - // compute spreading info - HPL_pipid(PANEL, ipl, ipID); - HPL_plindx(PANEL, *ipl, ipID, ipA, PANEL->lindxU, PANEL->lindxAU, PANEL->lindxA, iplen, PANEL->permU, iwork); - *iflag = 1; + PANEL->DINFO = ((double *)PANEL->lindxA) + lpiv; + PANEL->dDINFO = ((double *)PANEL->dlindxA) + lpiv; } - /* - * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- - * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack - * into U, otherwise we pack into workspace. The first - * entry of each column packed in workspace is in fact the row or column - * offset in U where it should go to. - */ - if(myrow == icurrow) { - // copy needed rows of A into U - HIP::HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, PANEL->dlindxU); - // record the evernt when packing completes - HIP::event_record(SWAPSTART, UPD); - } else { - // copy needed rows from A into U(:, iplen[myrow]) - HIP::HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, Mptr(dU, 0, iplen[myrow], LDU), LDU, PANEL->dlindxU); - // record the event when packing completes - HIP::event_record(SWAPSTART, UPD); - } + *(PANEL->DINFO) = 0.0; } - else if (phase == SWP_COMM) { - /* Set MPI message counts and offsets */ - ipcounts[0] = (iplen[1] - iplen[0]) * LDU; - ipoffsets[0] = 0; - for(int i = 1; i < nprow; ++i) { - ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU; - ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1]; - } + if (nprow == 1) + { + lwork = mp + JB; + } + else + { + itmp1 = (JB << 1); + lwork = nprow + 1; + itmp1 = Mmax(itmp1, lwork); + lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1; + } - if(myrow == icurrow) { - HIP::event_synchronize(SWAPSTART, UPD); - // Send rows info to other ranks - HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm); - // All gather dU (gather + broadcast) - HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm); - } else { - // Wait for dU to be ready - HIP::event_synchronize(SWAPSTART, UPD); - // Receive rows from icurrow into dW - HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm); - // All gather dU - HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm); + if (PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int)) + { + if (PANEL->IWORK) + { + std::free(PANEL->IWORK); } - } - else if (phase == SWP_END) { - if(myrow == icurrow) { - // Swap rows local to A on device - HIP::HPL_dlaswp02T(*ipA, n, dA, lda, PANEL->dlindxAU, PANEL->dlindxA); - } else { - // Queue inserting recieved rows in W into A on device - HIP::HPL_dlaswp04T(iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, PANEL->dlindxU); + size_t numbytes = (size_t)(lwork) * sizeof(int); + PANEL->IWORK = (int *)std::malloc(numbytes); + if (PANEL->IWORK == NULL) + { + HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host Integer Memory allocation failed"); } - /* Permute U in every process row */ - HIP::HPL_dlaswp10N(n, jb, dU, LDU, PANEL->dpermU); + PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int); } -} + if (lwork) + *(PANEL->IWORK) = -1; -// Setting the matrix section and phase of pdupdate -void HIP::HPL_pdlaswp_hip(HPL_T_panel* PANEL, int icurcol, std::list op_list) { - HPL_T_UPD UPD; - SWP_PHASE phase; - for (auto it = op_list.begin(); it != op_list.end(); ++it) { - const PDLASWP_OP op = *it; - if (op == SU0 || op == SU1 || op == SU2) phase = SWP_START; - else if (op == CU0 || op == CU1 || op == CU2) phase = SWP_COMM; - else if (op == EU0 || op == EU1 || op == EU2) phase = SWP_END; - else phase = SWP_NO; - - if (op == SU0 || op == CU0 || op == EU0) UPD = HPL_LOOK_AHEAD; - else if (op == SU1 || op == CU1 || op == EU1) UPD = HPL_UPD_1; - else if (op == SU2 || op == CU2 || op == EU2) UPD = HPL_UPD_2; - else UPD = HPL_N_UPD; + /* ensure the temp buffer in HPL_pdfact is allocated once*/ + lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1)); + if (PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double)) + { + if (PANEL->fWORK) + { + HIP_CHECK_ERROR(hipHostFree(PANEL->fWORK)); + } + size_t numbytes = (size_t)(lwork) * sizeof(double); - if (UPD == HPL_LOOK_AHEAD && PANEL->grid->mycol != icurcol) - continue; - else - HPL_pdlaswp_hip(PANEL, UPD, phase); + HIP_CHECK_ERROR(hipHostMalloc((void **)&PANEL->fWORK, numbytes)); + if (PANEL->fWORK == NULL) + { + HPL_pabort(__LINE__, "HPL_pdpanel_init", "Panel Host pdfact Scratch Memory allocation failed"); + } + PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double); } + /* + * End of HPL_pdpanel_init + */ } -void HIP::pdlaswp_set_var(HPL_T_panel* PANEL, double* &dU, double* &U, int &ldu, double* &dW, double* &W, int &ldw, int &n, double* &dA, const HPL_T_UPD UPD) { - switch (UPD) { - case HPL_LOOK_AHEAD: - dU = PANEL->dU; U = PANEL->U; ldu = PANEL->ldu0; - dW = PANEL->dW; W = PANEL->W; ldw = PANEL->ldu0; - n = PANEL->nu0; - break; - case HPL_UPD_1: - dU = PANEL->dU1; U = PANEL->U1; ldu = PANEL->ldu1; - dW = PANEL->dW1; W = PANEL->W1; ldw = PANEL->ldu1; - n = PANEL->nu1; - dA = Mptr(dA, 0, PANEL->nu0, PANEL->dlda); - break; - case HPL_UPD_2: - dU = PANEL->dU2; U = PANEL->U2; ldu = PANEL->ldu2; - dW = PANEL->dW2; W = PANEL->W2; ldw = PANEL->ldu2; - n = PANEL->nu2; - dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, PANEL->dlda); - break; - default: - break; - } -} \ No newline at end of file diff --git a/testing/backend/HPL_backendWrapper.cpp b/testing/backend/HPL_backendWrapper.cpp index 2d51ef3..0524317 100644 --- a/testing/backend/HPL_backendWrapper.cpp +++ b/testing/backend/HPL_backendWrapper.cpp @@ -6,14 +6,14 @@ extern "C" { /* * Initialize the Target device */ - void HPL_BE_init(size_t num_devices, enum HPL_TARGET TR) + void HPL_BE_init(const HPL_T_grid* GRID, enum HPL_TARGET TR) { switch(TR) { case T_CPU: DO_NOTHING(); break; case T_HIP: - HPL::dispatch(HIP::init, num_devices); + HPL::dispatch(HIP::init, GRID); break; default: DO_NOTHING(); diff --git a/testing/ptest/HPL_pddriver.cpp b/testing/ptest/HPL_pddriver.cpp index 0b428c6..f942e8e 100644 --- a/testing/ptest/HPL_pddriver.cpp +++ b/testing/ptest/HPL_pddriver.cpp @@ -99,7 +99,7 @@ int main( ARGC, ARGV ) inbm, indh, indv, ipfa, ipq, irfa, itop, mycol, myrow, ns, nbs, nbms, ndhs, ndvs, npcol, npfs, npqs, nprow, nrfs, ntps, - rank, size, tswap; + rank, size, tswap, p, q; HPL_T_ORDER pmapping; HPL_T_FACT rpfa; HPL_T_SWAP fswap; @@ -147,16 +147,9 @@ int main( ARGC, ARGV ) * 1 Equilibration (0=no,1=yes) * 8 memory alignment in double (> 0) */ - HPL_pdinfo( &test, &ns, nval, &nbs, nbval, &pmapping, &npqs, pval, qval, - &npfs, pfaval, &nbms, nbmval, &ndvs, ndvval, &nrfs, rfaval, - &ntps, topval, &ndhs, ndhval, &fswap, &tswap, &L1notran, - &Unotran, &equil, &align ); - - /* - * Init Target Device - */ - HPL_BE_init(1, HPL_TR); - + HPL_pdinfo( ARGC, ARGV, &test, &ns, nval, &nbs, nbval, &pmapping, &npqs, + pval, qval, &p, &q, &npfs, pfaval, &nbms, nbmval, &ndvs, ndvval, &nrfs, rfaval, &ntps, topval, &ndhs, ndhval, &fswap, + &tswap, &L1notran, &Unotran, &equil, &align ); /* * Loop over different process grids - Define process grid. Go to bottom * of process grid loop if this case does not use my process. @@ -164,11 +157,13 @@ int main( ARGC, ARGV ) for( ipq = 0; ipq < npqs; ipq++ ) { (void) HPL_grid_init( MPI_COMM_WORLD, pmapping, pval[ipq], qval[ipq], - &grid ); + p, q, &grid ); (void) HPL_grid_info( &grid, &nprow, &npcol, &myrow, &mycol ); if( ( myrow < 0 ) || ( myrow >= nprow ) || ( mycol < 0 ) || ( mycol >= npcol ) ) goto label_end_of_npqs; + + HPL_BE_init(&grid, HPL_TR); for( in = 0; in < ns; in++ ) { /* Loop over various problem sizes */ diff --git a/testing/ptest/HPL_pdinfo.cpp b/testing/ptest/HPL_pdinfo.cpp index 4eaab51..4938093 100644 --- a/testing/ptest/HPL_pdinfo.cpp +++ b/testing/ptest/HPL_pdinfo.cpp @@ -52,6 +52,8 @@ #ifdef STDC_HEADERS void HPL_pdinfo ( + int ARGC, + char** ARGV, HPL_T_test * TEST, int * NS, int * N, @@ -61,6 +63,8 @@ void HPL_pdinfo int * NPQS, int * P, int * Q, + int * p, + int * q, int * NPFS, HPL_T_FACT * PF, int * NBMS, @@ -274,7 +278,7 @@ void HPL_pdinfo int * iwork = NULL; char * lineptr; int error=0, fid, i, j, lwork, maxp, nprocs, - rank, size; + rank, size, _p = -1, _q = -1; /* .. * .. Executable Statements .. */ @@ -289,6 +293,20 @@ void HPL_pdinfo * Process 0 reads the input data, broadcasts to other processes and * writes needed information to TEST->outfp. */ + + for(int i = 1; i < ARGC; i++) { + if(strcmp(ARGV[i], "-p") == 0) { + _p = atoi(ARGV[i + 1]); + i++; + } + if(strcmp(ARGV[i], "-q") == 0) { + _q = atoi(ARGV[i + 1]); + i++; + } + } + *p = _p; + *q = _q; + char* status; if( rank == 0 ) { @@ -346,6 +364,7 @@ void HPL_pdinfo error = 1; goto label_error; } } + /* * Block size (>=1) (NB) */ diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp index eebc99e..0b118c1 100644 --- a/testing/ptest/HPL_pdtest.cpp +++ b/testing/ptest/HPL_pdtest.cpp @@ -158,7 +158,7 @@ void HPL_pdtest /* * generate matrix and right-hand-side, [ A | b ] which is N by N+1. */ - MPI_Type_contiguous(2*NB+4, MPI_DOUBLE, &PDFACT_ROW); + MPI_Type_contiguous(NB+4, MPI_DOUBLE, &PDFACT_ROW); MPI_Type_commit(&PDFACT_ROW); HPL_BE_dmatgen(GRID, N, N+1, NB, mat.d_A, mat.ld, HPL_ISEED, T_HIP); From 6d0d8896a0c537d28905d2c594048ce393bdebb6 Mon Sep 17 00:00:00 2001 From: Redtorm Date: Mon, 12 Sep 2022 08:22:29 +0200 Subject: [PATCH 2/3] change the process mapping to column major --- CMakeLists.txt | 39 ++++++++++++++++++------------ scripts/config/HPL_16GPU.dat | 2 +- scripts/config/HPL_1GPU.dat | 2 +- scripts/config/HPL_2GPU.dat | 2 +- scripts/config/HPL_32GPU.dat | 2 +- scripts/config/HPL_4GPU.dat | 2 +- scripts/config/HPL_8GPU.dat | 4 +-- scripts/env.mun.sh | 7 ++++++ scripts/mpirun_xhplhip.sh | 7 ++++-- scripts/run_hpl.slurm | 38 +++++++++++++++++------------ scripts/run_xhplhip.sh | 7 +++--- src/grid/HPL_grid_init.cpp | 2 +- testing/backend/HPL_backendHIP.cpp | 8 +++--- testing/ptest/HPL_pddriver.cpp | 2 +- testing/ptest/HPL_pdtest.cpp | 2 +- 15 files changed, 77 insertions(+), 49 deletions(-) create mode 100755 scripts/env.mun.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 84b31a6..f594214 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,10 +20,18 @@ EXECUTE_PROCESS( find_package(OpenMP) # Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm -list(APPEND CMAKE_PREFIX_PATH "/opt/rocm") -include(/opt/rocm/lib/cmake/hip/FindHIP.cmake) -include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake) -find_package( hip REQUIRED ) +if(NOT DEFINED ROCM_PATH) + if(DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH}) + else() + set(ROCM_PATH "/opt/rocm") + endif() +endif() +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) +set(CMAKE_MODULE_PATH "${ROCM_PATH}/hip/cmake" ${CMAKE_MODULE_PATH}) +set(CMAKE_MODULE_PATH "${ROCM_PATH}/share/rocm/cmake" ${CMAKE_MODULE_PATH}) +include(ROCMCheckTargetIds) +find_package( HIP REQUIRED ) find_package( rocblas REQUIRED ) # switch compiler and linker on non-Windows @@ -83,15 +91,6 @@ else () endif () endif () -# find_library(BLAS_LIBRARIES NAMES blis -# PATHS ${BLAS_DIR} -# NO_DEFAULT_PATH) -# if (BLAS_LIBRARIES) -# message(STATUS "Found BLAS: ${BLAS_LIBRARIES}") -# else() -# find_package(BLAS REQUIRED) -# endif() - if(NOT DEFINED BLAS_DIR) if(DEFINED ENV{BLAS_DIR}) set(BLAS_DIR $ENV{BLAS_DIR}) @@ -100,7 +99,18 @@ if(NOT DEFINED BLAS_DIR) else() list(APPEND CMAKE_PREFIX_PATH ${BLAS_DIR}) endif() -find_package( BLAS REQUIRED ) + +find_library(BLAS_LIBRARIES NAMES blis openblas + PATHS ${BLAS_DIR} + HINTS ${BLAS_DIR}/lib/zen3 ${BLAS_DIR}/lib + NO_DEFAULT_PATH) +if (BLAS_LIBRARIES) + message(STATUS "Found BLAS: ${BLAS_LIBRARIES}") +else() + message(STATUS "BLAS NOT Found: ${BLAS_LIBRARIES}") + find_package(BLAS REQUIRED) +endif() +# find_package( BLAS REQUIRED ) # append math library, if found find_library(MATH_LIBRARY m) @@ -217,7 +227,6 @@ target_include_directories( xhplhip PUBLIC hip:device $ $ ) -# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so) target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX) configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF ) diff --git a/scripts/config/HPL_16GPU.dat b/scripts/config/HPL_16GPU.dat index 679c241..665aa5e 100644 --- a/scripts/config/HPL_16GPU.dat +++ b/scripts/config/HPL_16GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 364032 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 4 Ps 4 Qs diff --git a/scripts/config/HPL_1GPU.dat b/scripts/config/HPL_1GPU.dat index ee8e10a..3413c87 100644 --- a/scripts/config/HPL_1GPU.dat +++ b/scripts/config/HPL_1GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 91008 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 1 Ps 1 Qs diff --git a/scripts/config/HPL_2GPU.dat b/scripts/config/HPL_2GPU.dat index 4897416..0362234 100644 --- a/scripts/config/HPL_2GPU.dat +++ b/scripts/config/HPL_2GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 128256 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 2 Ps 1 Qs diff --git a/scripts/config/HPL_32GPU.dat b/scripts/config/HPL_32GPU.dat index cf14244..43fb13a 100644 --- a/scripts/config/HPL_32GPU.dat +++ b/scripts/config/HPL_32GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 513024 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 8 Ps 4 Qs diff --git a/scripts/config/HPL_4GPU.dat b/scripts/config/HPL_4GPU.dat index 6eb090a..e336bbd 100644 --- a/scripts/config/HPL_4GPU.dat +++ b/scripts/config/HPL_4GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 180864 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 2 Ps 2 Qs diff --git a/scripts/config/HPL_8GPU.dat b/scripts/config/HPL_8GPU.dat index 575e1a9..3d3c3c6 100644 --- a/scripts/config/HPL_8GPU.dat +++ b/scripts/config/HPL_8GPU.dat @@ -6,7 +6,7 @@ HPL.out output file name (if any) 256512 N 1 # of NBs 384 NBs -0 PMAP process mapping (0=Row-,1=Column-major) +1 PMAP process mapping (0=Row-,1=Column-major) 1 # of process grids (P x Q) 4 Ps 2 Qs @@ -19,7 +19,7 @@ HPL.out output file name (if any) 2 NDIVs 1 # of recursive panel fact. 2 RFACTs (0=left, 1=Crout, 2=Right) -8 # of broadcast +1 # of broadcast 6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast,7=BiDir) 1 # of lookahead depth 1 DEPTHs (>=0) diff --git a/scripts/env.mun.sh b/scripts/env.mun.sh new file mode 100755 index 0000000..5ef304e --- /dev/null +++ b/scripts/env.mun.sh @@ -0,0 +1,7 @@ +module reset + +module load rocm/5.3.0-10584 + +# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openblas-0.3.20-qbm5uv3ntjerkx4jzrprmelytviwoq2e/lib:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openmpi-4.1.4-3z7jsddbvczl4duixalzrtap3q5nuvjk/lib" +export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/home/lulu/blis/lib:/global/home/lulu/ompi/lib:/global/software_internal/rocm/rocm-5.3.0-10584/lib" +# export MPICH_GPU_SUPPORT_ENABLED=1 diff --git a/scripts/mpirun_xhplhip.sh b/scripts/mpirun_xhplhip.sh index 617143a..8e0fc76 100644 --- a/scripts/mpirun_xhplhip.sh +++ b/scripts/mpirun_xhplhip.sh @@ -7,6 +7,9 @@ mpi_bin=${mpi_dir}/bin/mpiexec mpi_lib=${mpi_dir}/lib hpl_runscript=./run_xhplhip.sh +if [ -z "${ROCM_PATH}" ]; then rocm_dir="/opt/rocm/lib"; +else rocm_dir="${ROCM_PATH}/lib"; fi + filename=HPL.dat P=$(sed -n "11, 1p" ${filename} | awk '{print $1}') @@ -18,9 +21,9 @@ num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH="${rocm_dir}":$LD_LIBRARY_PATH #Default MPI options -mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}" +mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH="${rocm_dir}/lib":$LD_LIBRARY_PATH ${mpi_args}" ${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} # ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript} diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl.slurm index 381779b..7e8f995 100755 --- a/scripts/run_hpl.slurm +++ b/scripts/run_hpl.slurm @@ -5,14 +5,16 @@ #SBATCH -c 8 #SBATCH -t 2:00:00 ##SBATCH -A VEN114 -#SBATCH -A project_462000075 #SBATCH -J xhplhip #SBATCH --gpu-bind=closest #SBATCH --ntasks-per-node=8 #SBATCH --gpus-per-node=8 +#SBATCH -p MI250-x4-IB +#SBATCH -w mun-node-4 +#SBATCH --exclusive #source ../env/env.crusher.sh -source ../env/env.lumi.sh +source env.mun.sh NP=$SLURM_NPROCS NODES=$SLURM_NNODES @@ -21,20 +23,24 @@ LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt cp config/HPL_${NP}GPU.dat HPL.dat -EXE="../build/xhplhip" -CMD="" -CMD+="srun " -CMD+="-v " -CMD+="-n $NP " -CMD+="-N $NODES " -CMD+="-A VEN114 " -CMD+="--gpu-bind=closest " -CMD+="--ntasks-per-node=8 " -CMD+="--gpus-per-node=8 " -CMD+="-c 8 " -CMD+="-o $LOG -e $LOG " -#CMD+="${HOME}/mpich_bind.sh " -CMD+="$EXE" +# EXE="../build/xhplhip" +# CMD="" +# CMD+="srun " +# CMD+="-v " +# CMD+="-n $NP " +# CMD+="-N $NODES " +# CMD+="-A VEN114 " +# CMD+="--gpu-bind=closest " +# CMD+="--ntasks-per-node=8 " +# CMD+="--gpus-per-node=8 " +# CMD+="-c 8 " +# CMD+="-o $LOG -e $LOG " +# #CMD+="${HOME}/mpich_bind.sh " +# CMD+="$EXE" + +sh mpirun_xhplhip.sh + + if [ $NODES -gt 8 ]; then echo "export FI_MR_CACHE_MAX_COUNT=0" diff --git a/scripts/run_xhplhip.sh b/scripts/run_xhplhip.sh index ea4d314..c48d2f8 100755 --- a/scripts/run_xhplhip.sh +++ b/scripts/run_xhplhip.sh @@ -2,8 +2,9 @@ # set -x #echo on hpl_bin=./xhplhip -rocblas_dir=/opt/rocm-5.2.0/lib -blas_dir=/global/home/lulu/hyc/rocHPL-main/tpl/blis/lib +if [ -z "${ROCM_PATH}" ]; then rocblas_dir="/opt/rocm/lib" +else rocblas_dir="${ROCM_PATH}/lib"; fi +blas_dir="${BLAS_DIR}/lib/zen3" filename=./HPL.dat p=-1 @@ -229,4 +230,4 @@ if [[ $globalRank -lt $size ]]; then echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places" fi #run -${hpl_bin} +${hpl_bin} -p ${p} -q ${q} diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp index 2130c0a..7c20055 100644 --- a/src/grid/HPL_grid_init.cpp +++ b/src/grid/HPL_grid_init.cpp @@ -140,7 +140,7 @@ int HPL_grid_init local_mycol = local_rank % q; local_myrow = local_rank / q; int noderow = node / (NPCOL / q); int nodecol = node % (NPCOL / q); myrow = noderow * p + local_myrow; mycol = nodecol * q + local_mycol; - myrow = rank / NPCOL; mycol = rank - myrow * NPCOL; + // myrow = rank / NPCOL; mycol = rank - myrow * NPCOL; } else { diff --git a/testing/backend/HPL_backendHIP.cpp b/testing/backend/HPL_backendHIP.cpp index 36b07ee..b425d75 100644 --- a/testing/backend/HPL_backendHIP.cpp +++ b/testing/backend/HPL_backendHIP.cpp @@ -35,6 +35,7 @@ void HIP::init(const HPL_T_grid* GRID) HIP_CHECK_ERROR(hipGetDeviceCount(&count)); //TODO: set dynamic device id int device_id = localRank % count; + // printf("host: %s, device id: %d, myrow: %d, mycol: %d\n", host_name, device_id, GRID->local_myrow, GRID->local_mycol); HIP_CHECK_ERROR(hipSetDevice(device_id)); // Get device properties @@ -1653,7 +1654,7 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int */ PANEL->ldl2 = 0; /* local leading dim of array L2 */ PANEL->dldl2 = 0; /* local leading dim of array L2 */ - PANEL->dldl1 = 1.02 * A->dN; // padding + PANEL->dldl1 = 1.015 * A->dN; // padding to avoid stack overflow PANEL->len = 0; /* length of the buffer to broadcast */ PANEL->nu0 = 0; PANEL->nu1 = 0; @@ -1666,9 +1667,10 @@ void HIP::panel_init(HPL_T_grid *GRID, HPL_T_palg *ALGO, const int M, const int /*Split fraction*/ const double fraction = 0.6; - if ((double)M / A->dN > 0.97) { + // get the panel init time + if ((double)M / A->dN > 0.985) { HPL_ptimer_boot(); - HPL_ptimer( 0 ); + HPL_ptimer( HPL_rzero ); } dalign = ALGO->align * sizeof(double); size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double)); diff --git a/testing/ptest/HPL_pddriver.cpp b/testing/ptest/HPL_pddriver.cpp index f942e8e..3ddc5c0 100644 --- a/testing/ptest/HPL_pddriver.cpp +++ b/testing/ptest/HPL_pddriver.cpp @@ -220,7 +220,7 @@ int main( ARGC, ARGV ) algo.fswap = fswap; algo.fsthr = tswap; algo.equil = equil; algo.align = align; - + HPL_pdtest( &test, &grid, &algo, nval[in], nbval[inb] ); } diff --git a/testing/ptest/HPL_pdtest.cpp b/testing/ptest/HPL_pdtest.cpp index afea16a..e25ac5f 100644 --- a/testing/ptest/HPL_pdtest.cpp +++ b/testing/ptest/HPL_pdtest.cpp @@ -230,6 +230,7 @@ void HPL_pdtest (void) vsip_blockrelease_d( mat.block, VSIP_TRUE ); vsip_blockdestroy_d( mat.block ); #endif + /* * Gather max of all CPU and WALL clock timings and print timing results */ @@ -510,7 +511,6 @@ void HPL_pdtest HPL_fprintf( TEST->outfp, "%s%16.8e%s%s\n", "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ", resid1, " ...... ", ( resid1 < TEST->thrsh ? "PASSED" : "FAILED" ) ); - if( resid1 >= TEST->thrsh ) { HPL_fprintf( TEST->outfp, "%s%18.6f\n", From 3bc70c6a8c6dba627ab665740b8a50cce831cee2 Mon Sep 17 00:00:00 2001 From: Redtorm Date: Mon, 12 Sep 2022 10:10:06 +0200 Subject: [PATCH 3/3] add slurm running script for lumi --- HPL.dat | 2 +- scripts/run_hpl_lumi.slurm | 56 ++++++++++++++++++++ scripts/{run_hpl.slurm => run_hpl_mun.slurm} | 5 +- 3 files changed, 58 insertions(+), 5 deletions(-) create mode 100644 scripts/run_hpl_lumi.slurm rename scripts/{run_hpl.slurm => run_hpl_mun.slurm} (95%) diff --git a/HPL.dat b/HPL.dat index 3f8575c..4033a9a 100755 --- a/HPL.dat +++ b/HPL.dat @@ -3,7 +3,7 @@ Innovative Computing Laboratory, University of Tennessee HPL.out output file name (if any) 0 device out (6=stdout,7=stderr,file) 1 # of problems sizes (N) -256000 N +256128 N 1 # of NBs 384 NBs 1 PMAP process mapping (0=Row-,1=Column-major) diff --git a/scripts/run_hpl_lumi.slurm b/scripts/run_hpl_lumi.slurm new file mode 100644 index 0000000..a6f2d28 --- /dev/null +++ b/scripts/run_hpl_lumi.slurm @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH -v +#SBATCH -N 2 +#SBATCH -n 16 +#SBATCH -c 8 +#SBATCH -t 1:00:00 +#SBATCH -A VEN114 +#SBATCH -J xhplhip +#SBATCH --gpu-bind=closest +#SBATCH --job-name=hpl_gpu # Job name +#SBATCH --output=hpl.o%j # Name of stdout output file +#SBATCH --error=hpl.e%j # Name of stderr error file +#SBATCH --partition=gpu # Partition (queue) name +#SBATCH --ntasks-per-node=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=0-01:00:00 # Run time (d-hh:mm:ss) +#SBATCH --account=project_462000075 # Project for billing +#SBATCH --exclusive + +source ../env/env.lumi.sh + +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" +export MPICH_GPU_SUPPORT_ENABLED=1 + +NP=$SLURM_NPROCS +NODES=$SLURM_NNODES +DATE=$(date +%y%m%d-%H%M%S) +LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt + +cp config/HPL_${NP}GPU.dat HPL.dat + +EXE="./xhplhip -p 4 -q 2" +CMD="" +CMD+="srun " +CMD+="-v " +CMD+="-n $NP " +CMD+="-N $NODES " +# CMD+="-A VEN114 " +CMD+="--gpu-bind=closest " +CMD+="--ntasks-per-node=8 " +CMD+="--gpus-per-node=8 " +CMD+="--exclusive " +CMD+="-c 8 " +CMD+="-o $LOG -e $LOG " +#CMD+="${HOME}/mpich_bind.sh " +CMD+="$EXE" + +#export MPICH_SMP_SINGLE_COPY_MODE=NONE # does not work +export FI_MR_CACHE_MAX_COUNT=0 +export MPICH_RANK_REORDER_DISPLAY=1 + +echo $CMD >> $LOG +echo $CMD 2>&1 | tee -a $LOG + $CMD 2>&1 | tee -a $LOG +cat HPL.dat 2>&1 | tee -a $LOG +cat HPL.out 2>&1 | tee -a $LOG \ No newline at end of file diff --git a/scripts/run_hpl.slurm b/scripts/run_hpl_mun.slurm similarity index 95% rename from scripts/run_hpl.slurm rename to scripts/run_hpl_mun.slurm index 7e8f995..2b59355 100755 --- a/scripts/run_hpl.slurm +++ b/scripts/run_hpl_mun.slurm @@ -13,7 +13,6 @@ #SBATCH -w mun-node-4 #SBATCH --exclusive -#source ../env/env.crusher.sh source env.mun.sh NP=$SLURM_NPROCS @@ -38,9 +37,7 @@ cp config/HPL_${NP}GPU.dat HPL.dat # #CMD+="${HOME}/mpich_bind.sh " # CMD+="$EXE" -sh mpirun_xhplhip.sh - - +bash mpirun_xhplhip.sh if [ $NODES -gt 8 ]; then echo "export FI_MR_CACHE_MAX_COUNT=0"