Merge pull request #1039 from 3dem/sycl_memory_pool

Implemented memory pool for SYCL device/host memory
3dem · Dec 4, 2023 · e368e1c · e368e1c
2 parents fb5d7c9 + 6bba191
commit e368e1c
Show file tree

Hide file tree

Showing 27 changed files with 857 additions and 209 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -242,12 +242,12 @@ elseif(SYCL)
 	add_definitions(-D_SYCL_ENABLED=1 -DUSE_MPI_COLLECTIVE)
 	if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR "${CMAKE_CXX_COMPILER}" MATCHES "mpiicpx")
 		add_definitions(-D_DPCPP_ENABLED=1)
-		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-zeros")
+		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fhonor-infinities -fhonor-nans -fsigned-zeros")
 	endif()
 elseif(ALTCPU)
 	add_definitions(-DALTCPU=1 -DUSE_MPI_COLLECTIVE)
 	if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-zeros")
+		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fhonor-infinities -fhonor-nans -fsigned-zeros")
 	endif()
 endif()
 
@@ -422,7 +422,7 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" OR "${CMAKE_CXX_COMPILER}" MATCHES
 	endif(MKLFFT)
 
 	if(INTEL_IPP OR DEFINED ENV{IPPROOT})
-		if(EXISTS "$ENV{IPPROOT}/include/ipps.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
+		if(EXISTS "$ENV{IPPROOT}/include/ipp.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
 			if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
 				SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qipp")
 				SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -qipp -qipp-link=static")

diff --git a/README_sycl.md b/README_sycl.md
@@ -1,6 +1,6 @@
 # RELION SYCL/DPC++ version
 
-This is SYCL/DPC++ version for [RELION](https://github.com/3dem/relion)
+This is SYCL/DPC++ version of [RELION](https://github.com/3dem/relion)
 
 ## Build & Running
 
@@ -40,38 +40,40 @@ $ cmake \
 $ #### This is Intel GPU Level Zero backend specific #####
 $ export ZE_AFFINITY_MASK=0         # Use only the first available Level Zero device. This can be replaced by --gpu 0 syntax.
 $ export ZEX_NUMBER_OF_CCS=0:4,1:4  # Set this only if you are putting more than one MPI ranks per GPU. 0:4 means 4 MPI ranks running on card 0
-$ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
-$ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
-$ export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR="1;4G;host:16M,512,64K;device:16M,1024,64K;shared:0,0,64K"
 $ #### End of Intel GPU Level Zero backend specific  #####
 $ # For finer control of SYCL devcices, please see the above descrpition on ONEAPI_DEVICE_SELECTOR
-$ 
-$ ulimit -n 512000 # this is necessary for multi-GPU jobs
 $ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/--sycl-opencl/--sycl-cpu/--sycl-cuda/--sycl-hip}
 ```
 
+
 ## Optional runtime environment variables
 
 + The below shell environment variables can be tested for more potential SYCL specific tuning. Setting it to "1" or "on" will enable these features.
-	+ `relionSyclUseCuda`: --sycl-cuda will be used even if --gpu/--sycl is specified in command lines
-	+ `relionSyclUseHip`: --sycl-hip will be used even if --gpu/--sycl is specified in command lines
+	+ `relionSyclUseCuda`: This with --gpu have the same meaning as --sycl-cuda. --sycl-cuda will be used even if --gpu/--sycl is specified in command lines.
+	+ `relionSyclUseHip`: This with --gpu have the same meaning as --sycl-hip. --sycl-hip will be used even if --gpu/--sycl is specified in command lines
 	+ `relionSyclUseInOrderQueue`: Use in-order SYCL queue. Without this, out-of-order SYCL queue is used by default. (experimental)
 	+ `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
 	+ `relionSyclUseStream`: Create new in-order SYCL queue for each cudaStream. (experimental)
 	+ `relionSyclUseSubSubDevice`: Create separate SYCL queue for each CCS. (experimental)
+	+ `relionSyclBlockSize`: SYCL memory pool block size. This takes precedence over relionSyclHostBlockSize and relionSyclDeviceBlockSize.
+	+ `relionSyclHostBlockSize`: SYCL memory pool block size for sycl::malloc_host. Default is 256MB
+	+ `relionSyclDeviceBlockSize`: SYCL memory pool block size for sycl::malloc_device. Default is 256MB
+	+ `MAX_MPI_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point and collective communication. This takes precedence over MAX_MPI_P2P_BLOCK and MAX_MPI_COLL_BLOCK.
+	+ `MAX_MPI_P2P_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point communication. Default is 4GB.
+	+ `MAX_MPI_COLL_BLOCK`: Maximum MPI message size per single MPI API call for collective communication. Default is 64MB.
 
 
 ## Added macros
 
 + For CMake configuration
 	+ `SYCL`(=ON/OFF): Enable SYCL based acceleration build
+	+ `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target
+	+ `SYCL_CUDA_TARGET`: SYCL CUDA arch target (i.e. 80)
+	+ `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target
+	+ `SYCL_HIP_TARGET`: SYCL HIP arch target (i.e gfx90a)
 	+ `SyclForceOneDPL`(=ON/OFF): Use oneDPL(https://github.com/oneapi-src/oneDPL) if it can be used. This has the same effect as setting "-DUSE_ONEDPL" for CMAKE_CXX_FLAGS below. (experimental)
 	+ `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT(Ahead-Of-Time) compilation for SPIR64 target. Default target is pvc. (for future use)
 	+ `SYCL_AOT_TARGET`(=ON/OFF): Specify AOT(Ahead-Of-Time) SPIR64 target. Possible list can be checked using "ocloc compile --help" command. (for future use)
-	+ `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target (Not tested)
-	+ `SYCL_CUDA_TARGET`: SYCL CUDA arch target (Not tested)
-	+ `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target (Not tested)
-	+ `SYCL_HIP_TARGET`: SYCL HIP arch target (Not tested)
 	+ `SYCL_HOST_FLAGS`(=list of flags with space as separator): Additional flags for host compiler (for future use)
 	+ `SYCL_COMPILER_NAME`: SYCL compiler command name (for future use)
 	+ `SYCL_COMPILE_FLAGS`: Additional SYCL compile flags (for future use)

diff --git a/src/acc/acc_ml_optimiser_impl.h b/src/acc/acc_ml_optimiser_impl.h
@@ -2015,8 +2015,8 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
             PassWeights.weights.setAccPtr(&(~Mweight)[0]);
             PassWeights.weights.setHostPtr(&Mweight[0]);
             PassWeights.weights.setSize(nr_coarse_weights);
+            PassWeights.weights.doFreeHost=false;
         }
-        PassWeights.weights.doFreeHost=false;
 
         std::pair<size_t, XFLOAT> min_pair=AccUtilities::getArgMinOnDevice<XFLOAT>(PassWeights.weights);
         PassWeights.weights.cpToHost();

diff --git a/src/acc/acc_ptr.h b/src/acc/acc_ptr.h
@@ -557,13 +557,11 @@ class AccPtr
 				ACC_PTR_DEBUG_FATAL(str.c_str());
 				CRITICAL(RAMERR);
 			}
-			isHostSYCL = true;
 		}
 		else
 		{
 			if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
 				CRITICAL(RAMERR);
-			isHostSYCL = false;
 		}
 #else
 		if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
@@ -580,6 +578,12 @@ class AccPtr
 		freeHostIfSet();	
 	    setSize(newSize);
 	    setHostPtr(newArr);
+#ifdef _SYCL_ENABLED
+		if(accType == accSYCL)
+			isHostSYCL = true;
+		else
+			isHostSYCL = false;
+#endif
 	    doFreeHost=true;
 	}
 
@@ -602,13 +606,11 @@ class AccPtr
 				ACC_PTR_DEBUG_FATAL(str.c_str());
 				CRITICAL(RAMERR);
 			}
-			isHostSYCL = true;
 		}
 		else
 		{
 			if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
 				CRITICAL(RAMERR);
-			isHostSYCL = false;
 		}
 #else
 		if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
@@ -646,6 +648,12 @@ class AccPtr
 		freeHostIfSet();
 	    setSize(newSize);
 	    setHostPtr(newArr);
+#ifdef _SYCL_ENABLED
+		if(accType == accSYCL)
+			isHostSYCL = true;
+		else
+			isHostSYCL = false;
+#endif
 	    doFreeHost=true;
 	}