Skip to content

Commit

Permalink
Merge pull request #1039 from 3dem/sycl_memory_pool
Browse files Browse the repository at this point in the history
Implemented memory pool for SYCL device/host memory
  • Loading branch information
biochem-fan authored Dec 4, 2023
2 parents fb5d7c9 + 6bba191 commit e368e1c
Show file tree
Hide file tree
Showing 27 changed files with 857 additions and 209 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,12 @@ elseif(SYCL)
add_definitions(-D_SYCL_ENABLED=1 -DUSE_MPI_COLLECTIVE)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" OR "${CMAKE_CXX_COMPILER}" MATCHES "mpiicpx")
add_definitions(-D_DPCPP_ENABLED=1)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-zeros")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fhonor-infinities -fhonor-nans -fsigned-zeros")
endif()
elseif(ALTCPU)
add_definitions(-DALTCPU=1 -DUSE_MPI_COLLECTIVE)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM" OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-zeros")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fhonor-infinities -fhonor-nans -fsigned-zeros")
endif()
endif()

Expand Down Expand Up @@ -422,7 +422,7 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" OR "${CMAKE_CXX_COMPILER}" MATCHES
endif(MKLFFT)

if(INTEL_IPP OR DEFINED ENV{IPPROOT})
if(EXISTS "$ENV{IPPROOT}/include/ipps.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
if(EXISTS "$ENV{IPPROOT}/include/ipp.h" AND EXISTS "$ENV{IPPROOT}/lib/intel64/libipps.a")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qipp")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -qipp -qipp-link=static")
Expand Down
26 changes: 14 additions & 12 deletions README_sycl.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# RELION SYCL/DPC++ version

This is SYCL/DPC++ version for [RELION](https://github.com/3dem/relion)
This is SYCL/DPC++ version of [RELION](https://github.com/3dem/relion)

## Build & Running

Expand Down Expand Up @@ -40,38 +40,40 @@ $ cmake \
$ #### This is Intel GPU Level Zero backend specific #####
$ export ZE_AFFINITY_MASK=0 # Use only the first available Level Zero device. This can be replaced by --gpu 0 syntax.
$ export ZEX_NUMBER_OF_CCS=0:4,1:4 # Set this only if you are putting more than one MPI ranks per GPU. 0:4 means 4 MPI ranks running on card 0
$ export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
$ export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
$ export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR="1;4G;host:16M,512,64K;device:16M,1024,64K;shared:0,0,64K"
$ #### End of Intel GPU Level Zero backend specific #####
$ # For finer control of SYCL devcices, please see the above descrpition on ONEAPI_DEVICE_SELECTOR
$
$ ulimit -n 512000 # this is necessary for multi-GPU jobs
$ {Run 2D/3D/refinement application by replacing --gpu/--cpu with --gpu/--sycl/--sycl-opencl/--sycl-cpu/--sycl-cuda/--sycl-hip}
```


## Optional runtime environment variables

+ The below shell environment variables can be tested for more potential SYCL specific tuning. Setting it to "1" or "on" will enable these features.
+ `relionSyclUseCuda`: --sycl-cuda will be used even if --gpu/--sycl is specified in command lines
+ `relionSyclUseHip`: --sycl-hip will be used even if --gpu/--sycl is specified in command lines
+ `relionSyclUseCuda`: This with --gpu have the same meaning as --sycl-cuda. --sycl-cuda will be used even if --gpu/--sycl is specified in command lines.
+ `relionSyclUseHip`: This with --gpu have the same meaning as --sycl-hip. --sycl-hip will be used even if --gpu/--sycl is specified in command lines
+ `relionSyclUseInOrderQueue`: Use in-order SYCL queue. Without this, out-of-order SYCL queue is used by default. (experimental)
+ `relionSyclUseAsyncSubmission`: Remove wait() for each SYCL kernel submission. (experimental)
+ `relionSyclUseStream`: Create new in-order SYCL queue for each cudaStream. (experimental)
+ `relionSyclUseSubSubDevice`: Create separate SYCL queue for each CCS. (experimental)
+ `relionSyclBlockSize`: SYCL memory pool block size. This takes precedence over relionSyclHostBlockSize and relionSyclDeviceBlockSize.
+ `relionSyclHostBlockSize`: SYCL memory pool block size for sycl::malloc_host. Default is 256MB
+ `relionSyclDeviceBlockSize`: SYCL memory pool block size for sycl::malloc_device. Default is 256MB
+ `MAX_MPI_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point and collective communication. This takes precedence over MAX_MPI_P2P_BLOCK and MAX_MPI_COLL_BLOCK.
+ `MAX_MPI_P2P_BLOCK`: Maximum MPI message size per single MPI API call for point-to-point communication. Default is 4GB.
+ `MAX_MPI_COLL_BLOCK`: Maximum MPI message size per single MPI API call for collective communication. Default is 64MB.


## Added macros

+ For CMake configuration
+ `SYCL`(=ON/OFF): Enable SYCL based acceleration build
+ `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target
+ `SYCL_CUDA_TARGET`: SYCL CUDA arch target (i.e. 80)
+ `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target
+ `SYCL_HIP_TARGET`: SYCL HIP arch target (i.e gfx90a)
+ `SyclForceOneDPL`(=ON/OFF): Use oneDPL(https://github.com/oneapi-src/oneDPL) if it can be used. This has the same effect as setting "-DUSE_ONEDPL" for CMAKE_CXX_FLAGS below. (experimental)
+ `SYCL_AOT_COMPILE`(=ON/OFF): Enable AOT(Ahead-Of-Time) compilation for SPIR64 target. Default target is pvc. (for future use)
+ `SYCL_AOT_TARGET`(=ON/OFF): Specify AOT(Ahead-Of-Time) SPIR64 target. Possible list can be checked using "ocloc compile --help" command. (for future use)
+ `SYCL_CUDA_COMPILE`(=ON/OFF): Enable SYCL compilation for CUDA target (Not tested)
+ `SYCL_CUDA_TARGET`: SYCL CUDA arch target (Not tested)
+ `SYCL_HIP_COMPILE`(=ON/OFF): Enable SYCL compilation for HIP target (Not tested)
+ `SYCL_HIP_TARGET`: SYCL HIP arch target (Not tested)
+ `SYCL_HOST_FLAGS`(=list of flags with space as separator): Additional flags for host compiler (for future use)
+ `SYCL_COMPILER_NAME`: SYCL compiler command name (for future use)
+ `SYCL_COMPILE_FLAGS`: Additional SYCL compile flags (for future use)
Expand Down
2 changes: 1 addition & 1 deletion src/acc/acc_ml_optimiser_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2015,8 +2015,8 @@ void convertAllSquaredDifferencesToWeights(unsigned exp_ipass,
PassWeights.weights.setAccPtr(&(~Mweight)[0]);
PassWeights.weights.setHostPtr(&Mweight[0]);
PassWeights.weights.setSize(nr_coarse_weights);
PassWeights.weights.doFreeHost=false;
}
PassWeights.weights.doFreeHost=false;

std::pair<size_t, XFLOAT> min_pair=AccUtilities::getArgMinOnDevice<XFLOAT>(PassWeights.weights);
PassWeights.weights.cpToHost();
Expand Down
16 changes: 12 additions & 4 deletions src/acc/acc_ptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,13 +557,11 @@ class AccPtr
ACC_PTR_DEBUG_FATAL(str.c_str());
CRITICAL(RAMERR);
}
isHostSYCL = true;
}
else
{
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
CRITICAL(RAMERR);
isHostSYCL = false;
}
#else
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
Expand All @@ -580,6 +578,12 @@ class AccPtr
freeHostIfSet();
setSize(newSize);
setHostPtr(newArr);
#ifdef _SYCL_ENABLED
if(accType == accSYCL)
isHostSYCL = true;
else
isHostSYCL = false;
#endif
doFreeHost=true;
}

Expand All @@ -602,13 +606,11 @@ class AccPtr
ACC_PTR_DEBUG_FATAL(str.c_str());
CRITICAL(RAMERR);
}
isHostSYCL = true;
}
else
{
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
CRITICAL(RAMERR);
isHostSYCL = false;
}
#else
if(posix_memalign((void **)&newArr, MEM_ALIGN, sizeof(T) * newSize))
Expand Down Expand Up @@ -646,6 +648,12 @@ class AccPtr
freeHostIfSet();
setSize(newSize);
setHostPtr(newArr);
#ifdef _SYCL_ENABLED
if(accType == accSYCL)
isHostSYCL = true;
else
isHostSYCL = false;
#endif
doFreeHost=true;
}

Expand Down
Loading

0 comments on commit e368e1c

Please sign in to comment.