Merge pull request #1 from hpcgroup/raja-update
RAJA Update
pranav-sivaraman authored Jan 24, 2024
2 parents 3d93e15 + 088049f commit 9f26acb
Showing 2 changed files with 70 additions and 120 deletions.
75 changes: 39 additions & 36 deletions src/raja/fasten.hpp
@@ -4,6 +4,8 @@
#include <string>

#include "RAJA/RAJA.hpp"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "camp/resource.hpp"

#ifdef IMPL_CLS
@@ -16,31 +18,31 @@

template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

using launch_policy = RAJA::expt::LaunchPolicy< //
using launch_policy = RAJA::LaunchPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::expt::omp_launch_t
RAJA::omp_launch_t
#else
RAJA::expt::seq_launch_t
RAJA::seq_launch_t
#endif
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::expt::cuda_launch_t<false>
RAJA::cuda_launch_t<false>
#endif
#if defined(RAJA_ENABLE_HIP)
,
RAJA::expt::hip_launch_t<false>
RAJA::hip_launch_t<false>
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::expt::sycl_launch_t<false>
RAJA::sycl_launch_t<false>
#endif
>;

using teams_x = RAJA::expt::LoopPolicy< //
using teams_x = RAJA::LoopPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::omp_parallel_for_exec
#else
RAJA::loop_exec
RAJA::seq_exec
#endif
#if defined(RAJA_ENABLE_CUDA)
,
@@ -52,8 +54,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
#endif
>;

using threads_x = RAJA::expt::LoopPolicy< //
RAJA::loop_exec
using threads_x = RAJA::LoopPolicy< //
RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::cuda_thread_x_loop
@@ -81,11 +83,11 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
global = int(std::ceil(double(global) / double(wgsize)));
size_t local = int(wgsize);

RAJA::expt::launch<launch_policy>( //
static_cast<RAJA::expt::ExecPlace>(device), //
RAJA::expt::Grid(RAJA::expt::Teams(global), RAJA::expt::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { //
RAJA::expt::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
RAJA::launch<launch_policy>( //
static_cast<RAJA::ExecPlace>(device), //
RAJA::LaunchParams(RAJA::Teams(global), RAJA::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { //
RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
#ifdef USE_LOCAL_ARRAY
#error RAJA does not appear to support dynamically allocated LocalArray w/ the shared memory policy
RAJA_TEAM_SHARED FFParams *local_forcefield;
@@ -95,7 +97,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
float etot[PPWI];
float transform[3][4][PPWI];

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
size_t ix = gid * local * PPWI + lid;
ix = ix < nposes ? ix : nposes - PPWI;

@@ -135,9 +137,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
local_forcefield = forcefields;
#endif
});

ctx.teamSync();

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
// Loop over ligand atoms
size_t il = 0;
do {
@@ -232,32 +235,32 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
std::copy(xs.begin(), xs.end(), data);
return data;
}

template <typename T> static T *allocate(const size_t size) {
#ifndef RAJA_DEVICE_ACTIVE
return static_cast<T *>(std::malloc(sizeof(T) * size));
auto &rm = umpire::ResourceManager::getInstance();
#ifndef RAJA_TARGET_GPU
auto alloc = rm.getAllocator("HOST");
#else
T *ptr;
cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal);
return ptr;
auto alloc = rm.getAllocator("UM");
#endif
return static_cast<T *>(alloc.allocate(sizeof(T) * size));
}

template <typename T> static void deallocate(T *ptr) {
#ifndef RAJA_DEVICE_ACTIVE
std::free(ptr);
#else
cudaFree(ptr);
#endif
auto &rm = umpire::ResourceManager::getInstance();
rm.getAllocator(ptr).deallocate(ptr);
}

static void synchronise() {
// nothing to do for host devices
#if defined(RAJA_ENABLE_CUDA)
cudaDeviceSynchronize();
RAJA::synchronize<RAJA::cuda_synchronize>();
#endif
#if defined(RAJA_ENABLE_HIP)
hipDeviceSynchronize();
RAJA::synchronize<RAJA::hip_synchronize>();
#endif
#if defined(RAJA_ENABLE_SYCL)
RAJA::synchronize<RAJA::sycl_synchronize>();
#endif
}

@@ -267,26 +270,26 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
[[nodiscard]] std::string name() { return "raja"; };

[[nodiscard]] std::vector<Device> enumerateDevices() override {
std::vector<Device> devices{{RAJA::expt::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_DEVICE_ACTIVE)
std::vector<Device> devices{{(size_t) RAJA::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_TARGET_GPU)
#if defined(RAJA_ENABLE_CUDA)
const auto deviceName = "RAJA CUDA device";
#endif
#if defined(RAJA_ENABLE_HIP)
const auto deviceName = "Raja HIP device";
const auto deviceName = "RAJA HIP device";
#endif
#if defined(RAJA_ENABLE_SYCL)
const auto deviceName = "Raja SYCL device";
const auto deviceName = "RAJA SYCL device";
#endif
devices.template emplace_back(RAJA::expt::ExecPlace::DEVICE, deviceName);
devices.template emplace_back((size_t) RAJA::ExecPlace::DEVICE, deviceName);
#endif
return devices;
};

[[nodiscard]] Sample fasten(const Params &p, size_t wgsize, size_t device) const override {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();

auto protein = allocate(p.protein);
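For reference, the pieces changed above fit together roughly as follows. This is a minimal, self-contained sketch of the renamed (non-`expt`) RAJA launch API that the commit migrates to, assuming a sequential host-only RAJA build; it is not part of the commit, and GPU policies would be appended to the policy lists exactly as in the diff.

```cpp
// Minimal sketch of the post-rename RAJA launch API used in fasten.hpp above.
// Host-only, sequential build assumed; not part of the commit.
#include <cstdio>
#include "RAJA/RAJA.hpp"

using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using teams_x       = RAJA::LoopPolicy<RAJA::seq_exec>;
using threads_x     = RAJA::LoopPolicy<RAJA::seq_exec>;

int main() {
  const int global = 4;  // number of teams
  const int local  = 2;  // threads per team

  RAJA::launch<launch_policy>(
      RAJA::ExecPlace::HOST,  // fasten.hpp casts its runtime `device` index to ExecPlace
      RAJA::LaunchParams(RAJA::Teams(global), RAJA::Threads(local)),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
          RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
            std::printf("team %d, thread %d\n", gid, lid);
          });
          ctx.teamSync();  // barrier between the per-team loops, as in the kernel above
        });
      });
  return 0;
}
```

Relative to the old code, the functional changes are dropping the `RAJA::expt::` namespace, replacing `RAJA::expt::Grid` with `RAJA::LaunchParams`, and replacing the removed `RAJA::loop_exec` policy with `RAJA::seq_exec`.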
115 changes: 31 additions & 84 deletions src/raja/model.cmake
@@ -1,93 +1,40 @@
register_flag_optional(RAJA_BACK_END "Specify whether we target CPU/CUDA/HIP/SYCL" "CPU")

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")

register_flag_required(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:
-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
See https://raja.readthedocs.io/en/v0.14.0/sphinx/user_guide/config_options.html#available-raja-options-and-defaults for all available options
")

#register_flag_optional(TARGET
# "Target offload device, implemented values are CPU, NVIDIA, HIP"
# CPU)

register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[ENABLE_CUDA=ON only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH
"[ENABLE_CUDA=ON only] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")

register_flag_optional(CUDA_EXTRA_FLAGS
"[ENABLE_CUDA=ON only] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")

# compiler vendor and arch specific flags
set(RAJA_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)"
"OFF")

macro(setup)
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()

set(CMAKE_CXX_STANDARD 17)

find_package(RAJA REQUIRED)
find_package(umpire REQUIRED)

if (EXISTS "${RAJA_IN_TREE}")

message(STATUS "Building using in-tree RAJA source at `${RAJA_IN_TREE}`")

set(CMAKE_CUDA_STANDARD 17)

# don't build anything that isn't the RAJA library itself, by default their cmake def builds everything, whyyy?
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(ENABLE_DOCUMENTATION OFF CACHE BOOL "")

if (ENABLE_CUDA)

set(ENABLE_CUDA ON CACHE BOOL "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104)
set(CMAKE_POLICY_DEFAULT_CMP0104 OLD) # so that propogates to RAJA's CMakeList as well
cmake_policy(SET CMP0104 OLD)
endif()

# RAJA needs all the cuda stuff setup before including!
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})

message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif ()


add_subdirectory(${RAJA_IN_TREE} ${CMAKE_BINARY_DIR}/raja)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
endif ()

if (ENABLE_CUDA)
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
register_link_library(RAJA umpire)
if (${RAJA_BACK_END} STREQUAL "CUDA")
enable_language(CUDA)
set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -extended-lambda --expt-relaxed-constexpr --restrict --keep")

set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CUDA)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "HIP")
# Set CMAKE_CXX_COMPILER to hipcc
find_package(hip REQUIRED)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "SYCL")
register_definitions(RAJA_TARGET_GPU)
else()
register_definitions(RAJA_TARGET_CPU)
message(STATUS "Falling Back to CPU")
endif ()

if (MANAGED_ALLOC)
register_definitions(BUDE_MANAGED_ALLOC)
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
RAJA_FLAGS_CPU
${CMAKE_CXX_COMPILER_ID}
${CMAKE_SYSTEM_PROCESSOR}
)

endmacro()
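The `find_package(umpire REQUIRED)` and `register_link_library(RAJA umpire)` lines above are what back the new `allocate`/`deallocate` helpers in fasten.hpp. As a rough illustration, here is a minimal host-side sketch of that Umpire pattern (not part of the commit); the "UM" unified-memory allocator is only available in GPU-enabled builds, so this uses "HOST".

```cpp
// Minimal sketch of the Umpire allocation pattern the commit switches to.
// Host-only: requests the built-in "HOST" allocator; GPU builds would request
// "UM" instead, matching the #ifdef RAJA_TARGET_GPU branch in fasten.hpp.
#include <cstddef>
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

template <typename T> T *allocate(std::size_t size) {
  auto &rm   = umpire::ResourceManager::getInstance();
  auto alloc = rm.getAllocator("HOST");
  return static_cast<T *>(alloc.allocate(sizeof(T) * size));
}

template <typename T> void deallocate(T *ptr) {
  auto &rm = umpire::ResourceManager::getInstance();
  rm.getAllocator(ptr).deallocate(ptr);  // look up the owning allocator from the pointer
}

int main() {
  float *data = allocate<float>(1024);
  deallocate(data);
  return 0;
}
```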
