Refined SYCL over CUDA/HIP and removed possibility of minus input for MPI setting
do-jason committed Dec 3, 2023
1 parent 14257a8 commit 6bba191
Showing 3 changed files with 15 additions and 12 deletions.
12 changes: 6 additions & 6 deletions README_sycl.md
@@ -49,8 +49,8 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-
 ## Optional runtime environment variables
 
 + The shell environment variables below can be tested for more potential SYCL-specific tuning. Setting one to "1" or "on" enables the feature.
-  + `relionSyclUseCuda`: --sycl-cuda will be used even if --gpu/--sycl is specified in command lines
-  + `relionSyclUseHip`: --sycl-hip will be used even if --gpu/--sycl is specified in command lines
+  + `relionSyclUseCuda`: Setting this together with --gpu has the same meaning as --sycl-cuda; --sycl-cuda will be used even if --gpu/--sycl is specified on the command line.
+  + `relionSyclUseHip`: Setting this together with --gpu has the same meaning as --sycl-hip; --sycl-hip will be used even if --gpu/--sycl is specified on the command line.
   + `relionSyclUseInOrderQueue`: Use an in-order SYCL queue (see the sketch below). Without this, an out-of-order SYCL queue is used by default. (experimental)
   + `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
   + `relionSyclUseStream`: Create a new in-order SYCL queue for each cudaStream. (experimental)
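For illustration, a minimal sketch of how a "1"/"on" toggle such as `relionSyclUseInOrderQueue` could drive SYCL queue creation. This is not RELION's actual code: `envEnabled` and `makeQueue` are hypothetical names, while the queue property used is standard SYCL 2020 API.

```cpp
// Hypothetical sketch, not RELION code: shows what an env toggle like
// relionSyclUseInOrderQueue would switch between.
#include <cstdlib>
#include <cstring>
#include <sycl/sycl.hpp>

// True when the variable is set to "1" or "on", per the README convention.
static bool envEnabled(const char* name)
{
	const char* v = std::getenv(name);
	return v != nullptr && (std::strcmp(v, "1") == 0 || std::strcmp(v, "on") == 0);
}

static sycl::queue makeQueue(const sycl::device& dev)
{
	if (envEnabled("relionSyclUseInOrderQueue"))
		// An in-order queue executes kernels in submission order.
		return sycl::queue{dev, sycl::property::queue::in_order{}};
	return sycl::queue{dev}; // SYCL queues are out-of-order by default
}
```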
@@ -67,13 +67,13 @@ $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/-

 + For CMake configuration
   + `SYCL`(=ON/OFF): Enable the SYCL-based acceleration build
-  + `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target
-  + `SYCL_CUDA_TARGET`: SYCL CUDA arch target (i.e. 80)
-  + `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target
-  + `SYCL_HIP_TARGET`: SYCL HIP arch target (i.e gfx90a)
   + `SyclForceOneDPL`(=ON/OFF): Use oneDPL (https://github.com/oneapi-src/oneDPL) where possible (see the sketch after this list). This has the same effect as adding "-DUSE_ONEDPL" to CMAKE_CXX_FLAGS below. (experimental)
   + `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT (Ahead-Of-Time) compilation for the SPIR64 target. The default target is pvc. (for future use)
   + `SYCL_AOT_TARGET`: Specify the AOT (Ahead-Of-Time) SPIR64 target. The possible targets can be listed with the "ocloc compile --help" command. (for future use)
+  + `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for the CUDA target (Not tested)
+  + `SYCL_CUDA_TARGET`: SYCL CUDA arch target (Not tested)
+  + `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for the HIP target (Not tested)
+  + `SYCL_HIP_TARGET`: SYCL HIP arch target (Not tested)
   + `SYCL_HOST_FLAGS`(=list of flags with space as separator): Additional flags for the host compiler (for future use)
   + `SYCL_COMPILER_NAME`: SYCL compiler command name (for future use)
   + `SYCL_COMPILE_FLAGS`: Additional SYCL compile flags (for future use)
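Since `SyclForceOneDPL` amounts to defining `USE_ONEDPL`, here is a hedged sketch of the kind of call such a define typically guards. `sortOnDevice` is a hypothetical function, not a RELION entry point; `oneapi::dpl::execution::make_device_policy` and `oneapi::dpl::sort` are oneDPL's public API.

```cpp
// Hypothetical sketch: offloading a standard algorithm through oneDPL when
// USE_ONEDPL is defined, with a plain host fallback otherwise.
#ifdef USE_ONEDPL
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#endif
#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>

void sortOnDevice(sycl::queue& q, float* data, std::size_t n)
{
#ifdef USE_ONEDPL
	// Runs on q's device; data must be device-accessible (e.g., USM).
	oneapi::dpl::sort(oneapi::dpl::execution::make_device_policy(q), data, data + n);
#else
	(void)q;
	std::sort(data, data + n); // host fallback
#endif
}
```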
9 changes: 6 additions & 3 deletions src/apps/CMakeLists.txt
@@ -358,19 +358,22 @@ elseif(SYCL)
 	endif(SYCL_CUDA_COMPILE)
 	if(SYCL_HIP_COMPILE)
 		if(SYCL_CUDA_COMPILE)
+			set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_HIP)
+			set(REL_SYCL_TARGETS ${REL_SYCL_TARGETS},amdgcn-amd-amdhsa)
 		else(SYCL_CUDA_COMPILE)
 			set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_CUDA -DEIGEN_NO_HIP)
 			set(REL_SYCL_TARGETS amdgcn-amd-amdhsa)
 		endif(SYCL_CUDA_COMPILE)
 		# set(REL_SYCL_FRONTEND ${REL_SYCL_FRONTEND} -Xsycl-target-frontend=amdgcn-amd-amdhsa " -fp-model=fast")
 		set(REL_SYCL_BACKEND ${REL_SYCL_BACKEND} -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${SYCL_HIP_TARGET})
 		# set(REL_SYCL_BACKEND ${REL_SYCL_BACKEND} -Xsycl-target-backend=amdgcn-amd-amdhsa "--offload-arch=${SYCL_HIP_TARGET} -options -cl-fast-relaxed-math")
-		set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -DEIGEN_NO_CUDA -DEIGEN_NO_HIP)
 	endif()
 	set(REL_SYCL_TARGETS ${REL_SYCL_TARGETS},spir64)
 	# set(REL_SYCL_FRONTEND ${REL_SYCL_FRONTEND} -Xsycl-target-frontend=spir64 "-fp-model=fast")
 	set(REL_SYCL_COMPILE_TOOLCHAIN ${REL_SYCL_COMPILE_TOOLCHAIN} -Wno-unused-command-line-argument -DSYCL_USE_NATIVE_FP_ATOMICS -fsycl -fsycl-unnamed-lambda -fsycl-targets=${REL_SYCL_TARGETS} ${REL_SYCL_FRONTEND} ${REL_SYCL_BACKEND})
-	target_compile_options(relion_sycl_util PRIVATE ${REL_SYCL_COMPILE_TOOLCHAIN})
+	# target_compile_options(relion_sycl_util PRIVATE ${REL_SYCL_COMPILE_TOOLCHAIN})
+	string(REPLACE ";" " " REL_SYCL_COMPILE_TOOLCHAIN "${REL_SYCL_COMPILE_TOOLCHAIN}")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${REL_SYCL_COMPILE_TOOLCHAIN}" CACHE STRING "" FORCE)
 
 else(DEFINED SYCL_CUDA_COMPILE OR DEFINED SYCL_HIP_COMPILE)
 	target_compile_options(relion_sycl_util PRIVATE -Wno-unused-command-line-argument -DSYCL_USE_NATIVE_FP_ATOMICS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64)
@@ -456,7 +459,7 @@ foreach (_target ${RELION_TARGETS})
 		if(SYCL_CUDA_COMPILE)
 			set(REL_SYCL_TARGET_LINK -Wno-unknown-cuda-version)
 		endif(SYCL_CUDA_COMPILE)
-		set(REL_SYCL_TARGET_LINK ${REL_SYCL_TARGET_LINK} -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=${REL_SYCL_TARGETS},spir64 ${REL_SYCL_BACKEND})
+		set(REL_SYCL_TARGET_LINK ${REL_SYCL_TARGET_LINK} -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=${REL_SYCL_TARGETS} ${REL_SYCL_BACKEND})
 	else(DEFINED SYCL_CUDA_COMPILE OR DEFINED SYCL_HIP_COMPILE)
 		set(REL_SYCL_TARGET_LINK -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64)
 		# set(REL_SYCL_TARGET_LINK -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64 -Xsycl-target-backend=spir64 "-options -cl-fast-relaxed-math")
6 changes: 3 additions & 3 deletions src/mpi.cpp
@@ -118,9 +118,9 @@ MpiNode::MpiNode(int &argc, char ** argv)
 	char* pEnvP2P = std::getenv("MAX_MPI_P2P_BLOCK"); // For pt-2-pt blocking size
 	char* pEnvColl = std::getenv("MAX_MPI_COLL_BLOCK");// For collective blocking size
 
-	const std::ptrdiff_t lBlock = (pEnvBlock == NULL) ? 0 : std::strtoul(pEnvBlock, &pEnvBlock, 10);
-	const std::ptrdiff_t lP2P = (pEnvP2P == NULL) ? 0 : std::strtoul(pEnvP2P, &pEnvP2P, 10);
-	const std::ptrdiff_t lColl = (pEnvColl == NULL) ? 0 : std::strtoul(pEnvColl, &pEnvColl, 10);
+	const std::ptrdiff_t lBlock = (pEnvBlock == NULL) ? 0 : std::strtoul(pEnvBlock, nullptr, 10);
+	const std::ptrdiff_t lP2P = (pEnvP2P == NULL) ? 0 : std::strtoul(pEnvP2P, nullptr, 10);
+	const std::ptrdiff_t lColl = (pEnvColl == NULL) ? 0 : std::strtoul(pEnvColl, nullptr, 10);
 
 	if (lBlock > 0) // "MAX_MPI_BLOCK" has precedence if it is set
 	{
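The "minus input" in the commit title comes from `std::strtoul`, which accepts a leading minus sign and returns the negated value in unsigned arithmetic; below is a standalone demo (not RELION code) of why the `> 0` checks above are what actually reject negative settings.

```cpp
// Standalone demo of std::strtoul with a negative string.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

int main()
{
	const char* input = "-5"; // e.g., MAX_MPI_BLOCK=-5
	// strtoul parses the '-' and returns ULONG_MAX - 4; converting that to
	// the signed std::ptrdiff_t yields -5 on typical 64-bit platforms.
	const std::ptrdiff_t v = std::strtoul(input, nullptr, 10);
	std::printf("parsed: %td\n", v); // prints -5 on LP64 systems
	if (v > 0)
		std::puts("accepted");
	else
		std::puts("rejected"); // this branch runs, so the setting is ignored
	return 0;
}
```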