Refined SYCL over CUDA/HIP and removed possibility of minus input for MPI setting
do-jason committed Dec 3, 2023
1 parent 14257a8 commit 6bba191
Showing 3 changed files with 15 additions and 12 deletions.
12 changes: 6 additions & 6 deletions README_sycl.md
@@ -49,8 +49,8 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-
 ## Optional runtime environment variables
 
 + The shell environment variables below can be tested for more potential SYCL-specific tuning. Setting one to "1" or "on" enables the feature.
-  + `relionSyclUseCuda`: --sycl-cuda will be used even if --gpu/--sycl is specified in command lines
-  + `relionSyclUseHip`: --sycl-hip will be used even if --gpu/--sycl is specified in command lines
+  + `relionSyclUseCuda`: Setting this together with --gpu has the same meaning as --sycl-cuda; --sycl-cuda will be used even if --gpu/--sycl is specified on the command line.
+  + `relionSyclUseHip`: Setting this together with --gpu has the same meaning as --sycl-hip; --sycl-hip will be used even if --gpu/--sycl is specified on the command line.
   + `relionSyclUseInOrderQueue`: Use an in-order SYCL queue (see the sketch below). Without this, an out-of-order SYCL queue is used by default. (experimental)
   + `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
   + `relionSyclUseStream`: Create a new in-order SYCL queue for each cudaStream. (experimental)
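For illustration, a minimal sketch of how a "1"/"on" toggle such as `relionSyclUseInOrderQueue` could drive SYCL queue creation. This is not RELION's actual code: `envEnabled` and `makeQueue` are hypothetical names, while the queue property used is standard SYCL 2020 API.

```cpp
// Hypothetical sketch, not RELION code: shows what an env toggle like
// relionSyclUseInOrderQueue would switch between.
#include <cstdlib>
#include <cstring>
#include <sycl/sycl.hpp>

// True when the variable is set to "1" or "on", per the README convention.
static bool envEnabled(const char* name)
{
	const char* v = std::getenv(name);
	return v != nullptr && (std::strcmp(v, "1") == 0 || std::strcmp(v, "on") == 0);
}

static sycl::queue makeQueue(const sycl::device& dev)
{
	if (envEnabled("relionSyclUseInOrderQueue"))
		// An in-order queue executes kernels in submission order.
		return sycl::queue{dev, sycl::property::queue::in_order{}};
	return sycl::queue{dev}; // SYCL queues are out-of-order by default
}
```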
@@ -67,13 +67,13 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-

 + For CMake configuration
   + `SYCL`(=ON/OFF): Enable the SYCL-based acceleration build
-  + `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target
-  + `SYCL_CUDA_TARGET`: SYCL CUDA arch target (i.e. 80)
-  + `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target
-  + `SYCL_HIP_TARGET`: SYCL HIP arch target (i.e gfx90a)
   + `SyclForceOneDPL`(=ON/OFF): Use oneDPL (https://github.com/oneapi-src/oneDPL) where possible (see the sketch after this list). This has the same effect as adding "-DUSE_ONEDPL" to CMAKE_CXX_FLAGS below. (experimental)
   + `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT (Ahead-Of-Time) compilation for the SPIR64 target. The default target is pvc. (for future use)
   + `SYCL_AOT_TARGET`: Specify the AOT (Ahead-Of-Time) SPIR64 target. The possible targets can be listed with the "ocloc compile --help" command. (for future use)
+  + `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for the CUDA target (Not tested)
+  + `SYCL_CUDA_TARGET`: SYCL CUDA arch target (Not tested)
+  + `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for the HIP target (Not tested)
+  + `SYCL_HIP_TARGET`: SYCL HIP arch target (Not tested)
   + `SYCL_HOST_FLAGS`(=list of flags with space as separator): Additional flags for the host compiler (for future use)
   + `SYCL_COMPILER_NAME`: SYCL compiler command name (for future use)
   + `SYCL_COMPILE_FLAGS`: Additional SYCL compile flags (for future use)
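Since `SyclForceOneDPL` amounts to defining `USE_ONEDPL`, here is a hedged sketch of the kind of call such a define typically guards. `sortOnDevice` is a hypothetical function, not a RELION entry point; `oneapi::dpl::execution::make_device_policy` and `oneapi::dpl::sort` are oneDPL's public API.

```cpp
// Hypothetical sketch: offloading a standard algorithm through oneDPL when
// USE_ONEDPL is defined, with a plain host fallback otherwise.
#ifdef USE_ONEDPL
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#endif
#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

void sortOnDevice(sycl::queue& q, float* data, std::size_t n)
{
#ifdef USE_ONEDPL
	// Runs on q's device; data must be device-accessible (e.g., USM).
	oneapi::dpl::sort(oneapi::dpl::execution::make_device_policy(q), data, data + n);
#else
	(void)q;
	std::sort(data, data + n); // host fallback
#endif
}
```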
9 changes: 6 additions & 3 deletions src/apps/CMakeLists.txt
@@ -358,19 +358,22 @@ elseif(SYCL)
 	endif(SYCL_CUDA_COMPILE)
 	if(SYCL_HIP_COMPILE)
 		if(SYCL_CUDA_COMPILE)
+			set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_HIP)
+			set(REL_SYCL_TARGETS ${REL_SYCL_TARGETS},amdgcn-amd-amdhsa)
 		else(SYCL_CUDA_COMPILE)
 			set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_CUDA -DEIGEN_NO_HIP)
 			set(REL_SYCL_TARGETS amdgcn-amd-amdhsa)
 		endif(SYCL_CUDA_COMPILE)
 		# set(REL_SYCL_FRONTEND ${REL_SYCL_FRONTEND} -Xsycl-target-frontend=amdgcn-amd-amdhsa " -fp-model=fast")
 		set(REL_SYCL_BACKEND ${REL_SYCL_BACKEND} -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${SYCL_HIP_TARGET})
 		# set(REL_SYCL_BACKEND ${REL_SYCL_BACKEND} -Xsycl-target-backend=amdgcn-amd-amdhsa "--offload-arch=${SYCL_HIP_TARGET} -options -cl-fast-relaxed-math")
-		set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_CUDA -DEIGEN_NO_HIP)
 	endif()
 	set(REL_SYCL_TARGETS ${REL_SYCL_TARGETS},spir64)
 	# set(REL_SYCL_FRONTEND ${REL_SYCL_FRONTEND} -Xsycl-target-frontend=spir64 "-fp-model=fast")
 	set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -Wno-unused-command-line-argument -DSYCL_USE_NATIVE_FP_ATOMICS -fsycl -fsycl-unnamed-lambda -fsycl-targets=${REL_SYCL_TARGETS} ${REL_SYCL_FRONTEND} ${REL_SYCL_BACKEND})
-	target_compile_options(relion_sycl_util PRIVATE ${REL_SYCL_COMPILE_TOOLCHAIN})
+	# target_compile_options(relion_sycl_util PRIVATE ${REL_SYCL_COMPILE_TOOLCHAIN})
+	string(REPLACE ";" " " REL_SYCL_COMPILE_TOOLCHAIN "${REL_SYCL_COMPILE_TOOLCHAIN}")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${REL_SYCL_COMPILE_TOOLCHAIN}" CACHE STRING "" FORCE)
 
 else(DEFINED SYCL_CUDA_COMPILE OR DEFINED SYCL_HIP_COMPILE)
 	target_compile_options(relion_sycl_util PRIVATE -Wno-unused-command-line-argument -DSYCL_USE_NATIVE_FP_ATOMICS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64)
@@ -456,7 +459,7 @@ foreach (_target ${RELION_TARGETS})
 		if(SYCL_CUDA_COMPILE)
 			set(REL_SYCL_TARGET_LINK -Wno-unknown-cuda-version)
 		endif(SYCL_CUDA_COMPILE)
-		set(REL_SYCL_TARGET_LINK ${REL_SYCL_TARGET_LINK} -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=${REL_SYCL_TARGETS},spir64 ${REL_SYCL_BACKEND})
+		set(REL_SYCL_TARGET_LINK ${REL_SYCL_TARGET_LINK} -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=${REL_SYCL_TARGETS} ${REL_SYCL_BACKEND})
 	else(DEFINED SYCL_CUDA_COMPILE OR DEFINED SYCL_HIP_COMPILE)
 		set(REL_SYCL_TARGET_LINK -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64)
 		# set(REL_SYCL_TARGET_LINK -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64 -Xsycl-target-backend=spir64 "-options -cl-fast-relaxed-math")
6 changes: 3 additions & 3 deletions src/mpi.cpp
@@ -118,9 +118,9 @@ MpiNode::MpiNode(int &argc, char ** argv)
 	char* pEnvP2P = std::getenv("MAX_MPI_P2P_BLOCK"); // For pt-2-pt blocking size
 	char* pEnvColl = std::getenv("MAX_MPI_COLL_BLOCK");// For collective blocking size
 
-	const std::ptrdiff_t lBlock = (pEnvBlock == NULL) ? 0 : std::strtoul(pEnvBlock, &pEnvBlock, 10);
-	const std::ptrdiff_t lP2P = (pEnvP2P == NULL) ? 0 : std::strtoul(pEnvP2P, &pEnvP2P, 10);
-	const std::ptrdiff_t lColl = (pEnvColl == NULL) ? 0 : std::strtoul(pEnvColl, &pEnvColl, 10);
+	const std::ptrdiff_t lBlock = (pEnvBlock == NULL) ? 0 : std::strtoul(pEnvBlock, nullptr, 10);
+	const std::ptrdiff_t lP2P = (pEnvP2P == NULL) ? 0 : std::strtoul(pEnvP2P, nullptr, 10);
+	const std::ptrdiff_t lColl = (pEnvColl == NULL) ? 0 : std::strtoul(pEnvColl, nullptr, 10);
 
 	if (lBlock > 0) // "MAX_MPI_BLOCK" has precedence if it is set
 	{
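The "minus input" in the commit title comes from `std::strtoul`, which accepts a leading minus sign and returns the negated value in unsigned arithmetic; below is a standalone demo (not RELION code) of why the `> 0` checks above are what actually reject negative settings.

```cpp
// Standalone demo of std::strtoul with a negative string.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

int main()
{
	const char* input = "-5"; // e.g., MAX_MPI_BLOCK=-5
	// strtoul parses the '-' and returns ULONG_MAX - 4; converting that to
	// the signed std::ptrdiff_t yields -5 on typical 64-bit platforms.
	const std::ptrdiff_t v = std::strtoul(input, nullptr, 10);
	std::printf("parsed: %td\n", v); // prints -5 on LP64 systems
	if (v > 0)
		std::puts("accepted");
	else
		std::puts("rejected"); // this branch runs, so the setting is ignored
	return 0;
}
```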