Skip to content

Commit

Permalink
Merge pull request reger-men#23 from lueelu/master
Browse files Browse the repository at this point in the history
Further optimization for HPL_GPU
  • Loading branch information
reger-men authored Sep 12, 2022
2 parents 0cc18d3 + 3bc70c6 commit e35c3e3
Show file tree
Hide file tree
Showing 26 changed files with 1,236 additions and 970 deletions.
41 changes: 25 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ EXECUTE_PROCESS(
find_package(OpenMP)

# Add rocM root dir to CMAKE_PREFIX_PATH, usually /opt/rocm
list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
include(/opt/rocm/hip/cmake/FindHIP.cmake)
include(/opt/rocm/share/rocm/cmake/ROCMCheckTargetIds.cmake)
find_package( hip REQUIRED )
# Resolve the ROCm installation root. Precedence:
#   1. ROCM_PATH already defined (for example -DROCM_PATH=<dir>)
#   2. a non-empty ROCM_PATH environment variable
#   3. the conventional default, /opt/rocm
# An empty environment value is treated as unset so the module directories
# below never degenerate to "/hip/cmake" and "/share/rocm/cmake".
if(NOT DEFINED ROCM_PATH)
  if(DEFINED ENV{ROCM_PATH} AND NOT "$ENV{ROCM_PATH}" STREQUAL "")
    set(ROCM_PATH "$ENV{ROCM_PATH}")
  else()
    set(ROCM_PATH "/opt/rocm")
  endif()
endif()

# Make the ROCm tree visible to find_package().
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}")

# Prepend the ROCm CMake module directories so that include() and
# find_package() resolve modules from the selected installation first.
set(CMAKE_MODULE_PATH "${ROCM_PATH}/hip/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${ROCM_PATH}/share/rocm/cmake" ${CMAKE_MODULE_PATH})

include(ROCMCheckTargetIds)
find_package(HIP REQUIRED)
find_package(rocblas REQUIRED)

# switch compiler and linker on non-Windows
Expand Down Expand Up @@ -83,15 +91,6 @@ else ()
endif ()
endif ()

# find_library(BLAS_LIBRARIES NAMES blis
# PATHS ${BLAS_DIR}
# NO_DEFAULT_PATH)
# if (BLAS_LIBRARIES)
# message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
# else()
# find_package(BLAS REQUIRED)
# endif()

if(NOT DEFINED BLAS_DIR)
if(DEFINED ENV{BLAS_DIR})
set(BLAS_DIR $ENV{BLAS_DIR})
Expand All @@ -100,7 +99,18 @@ if(NOT DEFINED BLAS_DIR)
else()
list(APPEND CMAKE_PREFIX_PATH ${BLAS_DIR})
endif()
find_package( BLAS REQUIRED )

# Search for a BLAS library (blis is tried before openblas) only inside the
# BLAS_DIR tree; NO_DEFAULT_PATH keeps system locations out of the search.
find_library(BLAS_LIBRARIES
  NAMES blis openblas
  HINTS
    ${BLAS_DIR}/lib/zen3
    ${BLAS_DIR}/lib
  PATHS
    ${BLAS_DIR}
  NO_DEFAULT_PATH
)

if(NOT BLAS_LIBRARIES)
  # Nothing usable under BLAS_DIR: report it and fall back to find_package(),
  # which stops configuration if no BLAS is found (REQUIRED).
  message(STATUS "BLAS NOT Found: ${BLAS_LIBRARIES}")
  find_package(BLAS REQUIRED)
else()
  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
endif()

# append math library, if found
find_library(MATH_LIBRARY m)
Expand Down Expand Up @@ -212,12 +222,11 @@ hip_add_executable( xhplhip ${hpl_device_source} ${hpl_host_source})

target_compile_options(xhplhip PRIVATE ${CMAKE_HOST_FLAGS})
target_include_directories( xhplhip PUBLIC hip:device
${HIP_ROOT_DIR}/include
# ${HIP_ROOT_DIR}/include
${HPLHIP_DEVRAND_INCLUDE_DIRS}
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
# target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} /global/home/lulu/mun-node-3/blis-multi-thread/lib/zen3/libblis.so)
target_link_libraries( xhplhip roc::rocblas roc::rocrand ${BLAS_LIBRARIES} ${MPI_CXX_LIBRARIES} OpenMP::OpenMP_CXX)

configure_file( include/hplhip_config.hin ${CMAKE_CURRENT_SOURCE_DIR}/include/hplhip_config.h @ONLY NEWLINE_STYLE LF )
Expand Down
8 changes: 4 additions & 4 deletions HPL.dat
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
0 device out (6=stdout,7=stderr,file)
1 # of problems sizes (N)
256000 N
256128 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
2 Ps
4 Qs
4 Ps
2 Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
Expand Down
6 changes: 3 additions & 3 deletions include/backend/hpl_backendHIP.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

#include <hip/hip_runtime.h>
#if defined(HPLHIP_USE_ROCRAND)
#include <rocrand.h>
#include <rocrand/rocrand.h>
#endif
#include <rocblas.h>
#include <rocblas/rocblas.h>

#include <cstdio>
#include <cstdlib>
Expand Down Expand Up @@ -81,7 +81,7 @@ enum SWP_PHASE {
};

namespace HIP {
void init(size_t);
void init(const HPL_T_grid*);
void release();

void malloc(void**, size_t);
Expand Down
2 changes: 1 addition & 1 deletion include/backend/hpl_backendWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ UPDATE_LOOK_AHEAD, L1TRANSFER, L2TRANSFER, DGEMMSTART, DGEMMSTOP, UPDATE, SWAPST
enum HPL_STREAM {HPL_COMPUTESTREAM, HPL_DATASTREAM, HPL_PDLASWPSTREAM};


void HPL_BE_init(size_t, enum HPL_TARGET);
void HPL_BE_init(const HPL_T_grid*, enum HPL_TARGET);

void HPL_BE_malloc(void**, size_t, enum HPL_TARGET);

Expand Down
6 changes: 6 additions & 0 deletions include/hpl_grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ typedef struct HPL_S_grid
int col_hdim; /* col_ip2 procs hypercube dimension */
int col_ip2m1; /* largest power of two <= npcol-1 */
int col_mask; /* col_ip2m1 procs hypercube mask */
int local_myrow;
int local_mycol;
int local_nprow;
int local_npcol;
} HPL_T_grid;

/*
Expand Down Expand Up @@ -132,6 +136,8 @@ STDC_ARGS( (
const HPL_T_ORDER,
const int,
const int,
const int,
const int,
HPL_T_grid *
) );
int HPL_grid_exit
Expand Down
1 change: 1 addition & 0 deletions include/hpl_panel.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ typedef struct HPL_S_panel
int dlda;
double* U2; /* ptr to U2 */
double* dU2;
int dldl1;
int dldl2;
int ldu1;
int ldu2;
Expand Down
1 change: 1 addition & 0 deletions include/hpl_pgesv.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ typedef struct HPL_S_pmat
double * d_X; /* device pointer to solution vector */
double * W;
double * dW;
int dN;
} HPL_T_pmat;
/*
* ---------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions include/hpl_ptest.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ typedef struct HPL_S_test
*/
void HPL_pdinfo
STDC_ARGS( (
int,
char**,
HPL_T_test *,
int *,
int *,
Expand All @@ -118,6 +120,8 @@ STDC_ARGS( (
int *,
int *,
int *,
int *,
int *,
HPL_T_FACT *,
int *,
int *,
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_16GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
364032 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
4 Ps
4 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_1GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
91008 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_2GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
128256 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
2 Ps
1 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_32GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
513024 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
8 Ps
4 Qs
Expand Down
2 changes: 1 addition & 1 deletion scripts/config/HPL_4GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
180864 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
2 Ps
2 Qs
Expand Down
4 changes: 2 additions & 2 deletions scripts/config/HPL_8GPU.dat
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ HPL.out output file name (if any)
256512 N
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
4 Ps
2 Qs
Expand All @@ -19,7 +19,7 @@ HPL.out output file name (if any)
2 NDIVs
1 # of recursive panel fact.
2 RFACTs (0=left, 1=Crout, 2=Right)
8 # of broadcast
1 # of broadcast
6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=ibcast,7=BiDir)
1 # of lookahead depth
1 DEPTHs (>=0)
Expand Down
7 changes: 7 additions & 0 deletions scripts/env.mun.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Environment setup for running xhplhip on the "mun" nodes.
# This file is sourced by the job script (see run_hpl_mun.slurm), so it must
# only set up the environment and never call exit.

# Return to the default module set before loading anything.
module reset

# Pin the ROCm toolchain version used for the build and the run.
module load rocm/5.3.0-10584

# Alternative library set (spack-built OpenBLAS 0.3.20 and Open MPI 4.1.4), kept for reference:
# export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openblas-0.3.20-qbm5uv3ntjerkx4jzrprmelytviwoq2e/lib:/global/software/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.4.0/openmpi-4.1.4-3z7jsddbvczl4duixalzrtap3q5nuvjk/lib"

# Append the BLIS, MPI ("ompi", presumably Open MPI) and ROCm library
# directories to the runtime search path. The ${VAR:+...} form emits the ":"
# separator only when LD_LIBRARY_PATH is already non-empty; a leading empty
# element would make the dynamic loader search the current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/global/home/lulu/blis/lib:/global/home/lulu/ompi/lib:/global/software_internal/rocm/rocm-5.3.0-10584/lib"
# export MPICH_GPU_SUPPORT_ENABLED=1
7 changes: 5 additions & 2 deletions scripts/mpirun_xhplhip.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ mpi_bin=${mpi_dir}/bin/mpiexec
mpi_lib=${mpi_dir}/lib
hpl_runscript=./run_xhplhip.sh

# ROCm library directory: ${ROCM_PATH}/lib when ROCM_PATH is set and
# non-empty, otherwise the default /opt/rocm/lib.
rocm_dir="${ROCM_PATH:-/opt/rocm}/lib"

filename=HPL.dat

P=$(sed -n "11, 1p" ${filename} | awk '{print $1}')
Expand All @@ -18,9 +21,9 @@ num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}')
total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets))

export LD_LIBRARY_PATH=${mpi_lib}:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH="${rocm_dir}":$LD_LIBRARY_PATH
#Default MPI options
mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx --report-bindings -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib ${mpi_args}"
# rocm_dir already ends in /lib, so it is forwarded unchanged; appending
# another /lib would export a non-existent <rocm>/lib/lib directory to the
# ranks. No nested double quotes: they would end the outer string early.
mpi_args="--map-by slot:PE=${total_cpu_cores} --bind-to core:overload-allowed --mca btl ^openib --mca pml ucx -x LD_LIBRARY_PATH=${rocm_dir}:${LD_LIBRARY_PATH} ${mpi_args}"

${mpi_bin} --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
# ${mpi_bin} --hostfile hostfile --allow-run-as-root -np ${np} ${mpi_args} ${hpl_runscript}
Expand Down
56 changes: 56 additions & 0 deletions scripts/run_hpl_lumi.slurm
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# SLURM batch script: launches xhplhip through srun and collects the command
# line, HPL.dat and HPL.out in a timestamped log file.
#
# NOTE(review): several directives below appear twice with different values
# (-t / --time, -A / --account, -J / --job-name) — confirm which one the
# scheduler honours and remove the other.
#SBATCH -v
#SBATCH -N 2
#SBATCH -n 16
#SBATCH -c 8
#SBATCH -t 1:00:00
#SBATCH -A VEN114
#SBATCH -J xhplhip
#SBATCH --gpu-bind=closest
#SBATCH --job-name=hpl_gpu # Job name
#SBATCH --output=hpl.o%j # Name of stdout output file
#SBATCH --error=hpl.e%j # Name of stderr error file
#SBATCH --partition=gpu # Partition (queue) name
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8
#SBATCH --time=0-01:00:00 # Run time (d-hh:mm:ss)
#SBATCH --account=project_462000075 # Project for billing
#SBATCH --exclusive

# Site environment; the relative path is resolved against the directory the
# job starts in.
source ../env/env.lumi.sh

# Put the Cray-provided library directories ahead of the inherited search path.
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
# presumably enables GPU-aware MPI in the MPICH runtime — TODO confirm
export MPICH_GPU_SUPPORT_ENABLED=1

# Job geometry as reported by SLURM, plus a per-run log file name built from
# the rank count, host name and start time.
NP=$SLURM_NPROCS
NODES=$SLURM_NNODES
DATE=$(date +%y%m%d-%H%M%S)
LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt

# Select the input file matching the rank count (config/HPL_<NP>GPU.dat).
cp config/HPL_${NP}GPU.dat HPL.dat

# NOTE(review): -p 4 -q 2 is fixed here regardless of NP; presumably the
# P x Q process grid — confirm it matches the selected HPL.dat.
EXE="./xhplhip -p 4 -q 2"
# Assemble the srun command line piece by piece. It is kept in a plain string
# and later expanded unquoted, so it relies on word splitting.
CMD=""
CMD+="srun "
CMD+="-v "
CMD+="-n $NP "
CMD+="-N $NODES "
# CMD+="-A VEN114 "
CMD+="--gpu-bind=closest "
CMD+="--ntasks-per-node=8 "
CMD+="--gpus-per-node=8 "
CMD+="--exclusive "
CMD+="-c 8 "
# srun is told to write both stdout and stderr of the job step to the log file.
CMD+="-o $LOG -e $LOG "
#CMD+="${HOME}/mpich_bind.sh "
CMD+="$EXE"

#export MPICH_SMP_SINGLE_COPY_MODE=NONE # does not work
export FI_MR_CACHE_MAX_COUNT=0
export MPICH_RANK_REORDER_DISPLAY=1

# Record the command, run it, then append the input and output files.
# NOTE(review): the command line is written to the log twice (once by the
# redirect, once by tee) — confirm whether that is intended.
echo $CMD >> $LOG
echo $CMD 2>&1 | tee -a $LOG
$CMD 2>&1 | tee -a $LOG
cat HPL.dat 2>&1 | tee -a $LOG
cat HPL.out 2>&1 | tee -a $LOG
37 changes: 20 additions & 17 deletions scripts/run_hpl.slurm → scripts/run_hpl_mun.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
#SBATCH -c 8
#SBATCH -t 2:00:00
##SBATCH -A VEN114
#SBATCH -A project_462000075
#SBATCH -J xhplhip
#SBATCH --gpu-bind=closest
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8
#SBATCH -p MI250-x4-IB
#SBATCH -w mun-node-4
#SBATCH --exclusive

#source ../env/env.crusher.sh
source ../env/env.lumi.sh
source env.mun.sh

NP=$SLURM_NPROCS
NODES=$SLURM_NNODES
Expand All @@ -21,20 +22,22 @@ LOG=log.hpl-gpu-${NP}np-${HOSTNAME}-${DATE}.txt

cp config/HPL_${NP}GPU.dat HPL.dat

EXE="../build/xhplhip"
CMD=""
CMD+="srun "
CMD+="-v "
CMD+="-n $NP "
CMD+="-N $NODES "
CMD+="-A VEN114 "
CMD+="--gpu-bind=closest "
CMD+="--ntasks-per-node=8 "
CMD+="--gpus-per-node=8 "
CMD+="-c 8 "
CMD+="-o $LOG -e $LOG "
#CMD+="${HOME}/mpich_bind.sh "
CMD+="$EXE"
# EXE="../build/xhplhip"
# CMD=""
# CMD+="srun "
# CMD+="-v "
# CMD+="-n $NP "
# CMD+="-N $NODES "
# CMD+="-A VEN114 "
# CMD+="--gpu-bind=closest "
# CMD+="--ntasks-per-node=8 "
# CMD+="--gpus-per-node=8 "
# CMD+="-c 8 "
# CMD+="-o $LOG -e $LOG "
# #CMD+="${HOME}/mpich_bind.sh "
# CMD+="$EXE"

bash mpirun_xhplhip.sh

if [ $NODES -gt 8 ]; then
echo "export FI_MR_CACHE_MAX_COUNT=0"
Expand Down
Loading

0 comments on commit e35c3e3

Please sign in to comment.