From 9cd97ac39f4816541df595893acce64fe7308695 Mon Sep 17 00:00:00 2001
From: Yuuichi Asahi
Date: Wed, 2 Aug 2023 22:02:38 +0900
Subject: [PATCH 1/2] Add OpenMP backend for executor implementations

---
 mini-apps/heat3d-mpi/executors/grid.hpp     | 21 ++++++---
 mini-apps/heat3d-mpi/executors/heat3D.hpp   |  6 +++
 mini-apps/heat3d-mpi/executors/mpi_comm.hpp | 15 ++++--
 mini-apps/heat3d-mpi/executors/variable.hpp | 10 +++-
 mini-apps/heat3d-mpi/stdpar/heat3D.hpp      |  6 +++
 mini-apps/heat3d/executors/CMakeLists.txt   |  7 +++
 mini-apps/heat3d/executors/grid.hpp         | 23 ++++++---
 mini-apps/heat3d/executors/heat3D.cpp       | 17 +++++--
 mini-apps/heat3d/executors/heat3D.hpp       |  6 +++
 mini-apps/heat3d/executors/variable.hpp     |  9 +++-
 mini-apps/heat3d/stdpar/heat3D.hpp          |  6 +++
 tutorial/04_stream/CMakeLists.txt           |  7 +++
 tutorial/04_stream/stream.cpp               | 39 ++++++++++----
 tutorial/04_stream/stream.hpp               | 52 +++++++++++++------
 tutorial/05_heat2d/CMakeLists.txt           |  7 +++
 tutorial/05_heat2d/heat2D.cpp               | 26 +++++++---
 tutorial/05_heat2d/heat2D.hpp               |  6 +++
 tutorial/05_heat2d/types.hpp                |  3 +-
 18 files changed, 209 insertions(+), 57 deletions(-)

diff --git a/mini-apps/heat3d-mpi/executors/grid.hpp b/mini-apps/heat3d-mpi/executors/grid.hpp
index 81b5e4a..ffa822a 100644
--- a/mini-apps/heat3d-mpi/executors/grid.hpp
+++ b/mini-apps/heat3d-mpi/executors/grid.hpp
@@ -2,19 +2,24 @@
 #define __GRID_HPP__
 
 #include <cmath>
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 #include "../types.hpp"
 #include "../config.hpp"
 
 template <typename ScalarType>
-inline thrust::device_vector<ScalarType> arange(const ScalarType start,
-                                                const ScalarType stop,
-                                                const ScalarType step=1
-                                               ) {
+inline auto arange(const ScalarType start,
+                   const ScalarType stop,
+                   const ScalarType step=1
+                  ) {
   const size_t length = ceil((stop - start) / step);
-  thrust::device_vector<ScalarType> result(length);
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<ScalarType> result(length);
+  #else
+    thrust::device_vector<ScalarType> result(length);
+  #endif
   ScalarType delta = (stop - start) / length;
   thrust::sequence(result.begin(), result.end(), start, delta);
@@ -27,7 +32,11 @@ struct Grid {
   using Shape1D = shape_type<1>;
 
 private:
-  thrust::device_vector<RealType> x_, y_, z_;
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<RealType> x_, y_, z_;
+  #else
+    thrust::device_vector<RealType> x_, y_, z_;
+  #endif
   Shape1D extents_nx_, extents_ny_, extents_nz_;
 
 public:

diff --git a/mini-apps/heat3d-mpi/executors/heat3D.hpp b/mini-apps/heat3d-mpi/executors/heat3D.hpp
index 349e0ea..923a982 100644
--- a/mini-apps/heat3d-mpi/executors/heat3D.hpp
+++ b/mini-apps/heat3d-mpi/executors/heat3D.hpp
@@ -167,6 +167,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 9 Flop per iteration
   double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   if(conf.is_async_) {
     std::cout << "Communication and Computation Overlap" << std::endl;
   }
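Note: the container switch above is the core idiom of this patch — under ENABLE_OPENMP the data lives in a thrust::host_vector, otherwise in a thrust::device_vector, and the Thrust algorithm calls are identical either way. A minimal standalone sketch of the idiom (illustrative only, not part of the patch; the file name and main() driver are hypothetical):

    // backend_switch_sketch.cpp -- illustrative only.
    // ENABLE_OPENMP is the compile definition added by the CMake changes below.
    #include <iostream>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>

    #if defined(ENABLE_OPENMP)
      using Vector = thrust::host_vector<double>;   // data stays in host memory
    #else
      using Vector = thrust::device_vector<double>; // data lives in GPU memory
    #endif

    int main() {
      Vector v(10);
      thrust::sequence(v.begin(), v.end());  // same algorithm call on both backends
      std::cout << v[9] << std::endl;        // prints 9 on either backend
      return 0;
    }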
diff --git a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
index 53feeea..e888be4 100644
--- a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
+++ b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
@@ -3,6 +3,7 @@
 
 #include <mpi.h>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 #include
 #include
 #include
@@ -45,7 +46,11 @@ template <typename RealType>
 struct Halo {
   using RealView2D = View2D<RealType>;
   using Shpae2D = shape_type<2>;
-  using Vector = thrust::device_vector<RealType>;
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
 
 private:
   Vector left_, right_;
@@ -393,10 +398,10 @@ class Comm {
       MPI_Waitall( 4, request, status );
     } else {
-      thrust::device_vector<RealType>& send_left_vector = send.left_vector();
-      thrust::device_vector<RealType>& send_right_vector = send.right_vector();
-      thrust::device_vector<RealType>& recv_left_vector = recv.left_vector();
-      thrust::device_vector<RealType>& recv_right_vector = recv.right_vector();
+      auto& send_left_vector = send.left_vector();
+      auto& send_right_vector = send.right_vector();
+      auto& recv_left_vector = recv.left_vector();
+      auto& recv_right_vector = recv.right_vector();
       thrust::swap( send_left_vector, recv_right_vector );
       thrust::swap( send_right_vector, recv_left_vector );

diff --git a/mini-apps/heat3d-mpi/executors/variable.hpp b/mini-apps/heat3d-mpi/executors/variable.hpp
index d07a558..94e32d6 100644
--- a/mini-apps/heat3d-mpi/executors/variable.hpp
+++ b/mini-apps/heat3d-mpi/executors/variable.hpp
@@ -14,8 +14,14 @@ struct Variable {
   using Shape1D = shape_type<1>;
   using Shape3D = shape_type<3>;
 
-  thrust::device_vector<RealType> u_, un_;
-  thrust::device_vector<RealType> x_mask_, y_mask_, z_mask_;
+
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
+  Vector u_, un_;
+  Vector x_mask_, y_mask_, z_mask_;
   Shape1D extents_x_, extents_y_, extents_z_;
   Shape3D extents3D_;

diff --git a/mini-apps/heat3d-mpi/stdpar/heat3D.hpp b/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
index d74750b..3cb2826 100644
--- a/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
+++ b/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
@@ -206,6 +206,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 9 Flop per iteration
   const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
   std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
   std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;

diff --git a/mini-apps/heat3d/executors/CMakeLists.txt b/mini-apps/heat3d/executors/CMakeLists.txt
index 998777b..5ba9354 100644
--- a/mini-apps/heat3d/executors/CMakeLists.txt
+++ b/mini-apps/heat3d/executors/CMakeLists.txt
@@ -1,3 +1,10 @@
 add_executable(heat3d-exec heat3D.cpp)
 target_link_libraries(heat3d-exec PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec)
 target_compile_features(heat3d-exec PUBLIC cxx_std_20)
+
+set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
+if(BACKEND STREQUAL "OPENMP")
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(heat3d-exec PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(heat3d-exec PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP)
+endif()
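Note: the backend is chosen at configure time, e.g. `cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP ..` (the exact command used in the job script of the second patch). Besides defining ENABLE_OPENMP, the option sets THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP, which retargets Thrust's "device" algorithms to OpenMP threads on the host. A small sketch of that effect (illustrative only; not part of the patch):

    // thrust_omp_sketch.cpp -- illustrative only.
    // Built with -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP (plus OpenMP flags),
    // the "device" algorithm below runs on host OpenMP threads instead of a GPU.
    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>

    int main() {
      thrust::device_vector<double> a(1000, 1.0), b(1000, 2.0), c(1000);
      // Dispatches to the OMP device system when so configured.
      thrust::transform(a.begin(), a.end(), b.begin(), c.begin(),
                        thrust::plus<double>());
      return c[0] == 3.0 ? 0 : 1;
    }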
diff --git a/mini-apps/heat3d/executors/grid.hpp b/mini-apps/heat3d/executors/grid.hpp
index 80b04a4..2656093 100644
--- a/mini-apps/heat3d/executors/grid.hpp
+++ b/mini-apps/heat3d/executors/grid.hpp
@@ -3,18 +3,23 @@
 
 #include <cmath>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 #include <thrust/sequence.h>
 #include "../types.hpp"
 #include "../config.hpp"
 
 template <typename ScalarType>
-inline thrust::device_vector<ScalarType> arange(const ScalarType start,
-                                                const ScalarType stop,
-                                                const ScalarType step=1
-                                               ) {
+inline auto arange(const ScalarType start,
+                   const ScalarType stop,
+                   const ScalarType step=1
+                  ) {
   const size_t length = ceil((stop - start) / step);
-  thrust::device_vector<ScalarType> result(length);
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<ScalarType> result(length);
+  #else
+    thrust::device_vector<ScalarType> result(length);
+  #endif
   ScalarType delta = (stop - start) / length;
   thrust::sequence(result.begin(), result.end(), start, delta);
@@ -26,8 +31,14 @@ struct Grid {
   using RealView1D = View1D<RealType>;
   using Shape1D = shape_type<1>;
 
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
+
 private:
-  thrust::device_vector<RealType> x_, y_, z_;
+  Vector x_, y_, z_;
   Shape1D extents_nx_, extents_ny_, extents_nz_;
 
 public:

diff --git a/mini-apps/heat3d/executors/heat3D.cpp b/mini-apps/heat3d/executors/heat3D.cpp
index eef311e..b5b33e0 100644
--- a/mini-apps/heat3d/executors/heat3D.cpp
+++ b/mini-apps/heat3d/executors/heat3D.cpp
@@ -1,11 +1,16 @@
 #include <chrono>
-#include "nvexec/stream_context.cuh"
 #include "../config.hpp"
 #include "../parser.hpp"
 #include "heat3D.hpp"
 #include "variable.hpp"
 #include "grid.hpp"
 
+#if defined(ENABLE_OPENMP)
+  #include <exec/static_thread_pool.hpp>
+#else
+  #include "nvexec/stream_context.cuh"
+#endif
+
 int main(int argc, char *argv[]) {
   Parser parser(argc, argv);
   auto nx = parser.shape_[0];
@@ -18,9 +23,13 @@ int main(int argc, char *argv[]) {
   Grid<double> grid(conf);
   Variable<double> variables(conf);
 
-  // Declare a CUDA stream
-  nvexec::stream_context stream_ctx{};
-  auto scheduler = stream_ctx.get_scheduler();
+  #if defined(ENABLE_OPENMP)
+    exec::static_thread_pool pool{std::thread::hardware_concurrency()};
+    auto scheduler = pool.get_scheduler();
+  #else
+    nvexec::stream_context stream_ctx{};
+    auto scheduler = stream_ctx.get_scheduler();
+  #endif
 
   initialize(conf, grid, scheduler, variables);
   auto start = std::chrono::high_resolution_clock::now();
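Note: the only backend-specific piece in main() is the scheduler; the sender pipelines in heat3D.hpp are written against a generic scheduler type. A reduced sketch of the pattern (illustrative only; the exact stdexec spellings depend on the stdexec revision this repo pins):

    // scheduler_switch_sketch.cpp -- illustrative only.
    #include <thread>
    #include <stdexec/execution.hpp>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>

    #if defined(ENABLE_OPENMP)
      #include <exec/static_thread_pool.hpp>
      using Vector = thrust::host_vector<double>;
    #else
      #include "nvexec/stream_context.cuh"
      using Vector = thrust::device_vector<double>;
    #endif

    int main() {
      #if defined(ENABLE_OPENMP)
        exec::static_thread_pool pool{std::thread::hardware_concurrency()};
        auto scheduler = pool.get_scheduler();
      #else
        nvexec::stream_context stream_ctx{};
        auto scheduler = stream_ctx.get_scheduler();
      #endif

      Vector v(100);
      double* ptr = thrust::raw_pointer_cast(v.data());
      // One bulk kernel, expressed once, runs on whichever scheduler was picked.
      auto work = stdexec::schedule(scheduler)
                | stdexec::bulk(100, [=](std::size_t i) { ptr[i] = 2.0 * static_cast<double>(i); });
      stdexec::sync_wait(std::move(work));
      return 0;
    }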
"CUDA backend" << std::endl; + #endif + std::cout << "Elapsed time: " << seconds << " [s]" << std::endl; std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl; std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl; diff --git a/tutorial/04_stream/CMakeLists.txt b/tutorial/04_stream/CMakeLists.txt index 612bc0d..447a326 100644 --- a/tutorial/04_stream/CMakeLists.txt +++ b/tutorial/04_stream/CMakeLists.txt @@ -1,3 +1,10 @@ add_executable(04_stream_test stream.cpp) target_link_libraries(04_stream_test PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec) target_compile_features(04_stream_test PUBLIC cxx_std_20) + +set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND") +if(BACKEND STREQUAL "OPENMP") + find_package(OpenMP REQUIRED) + target_link_libraries(04_stream_test PUBLIC OpenMP::OpenMP_CXX) + target_compile_definitions(04_stream_test PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP) +endif() diff --git a/tutorial/04_stream/stream.cpp b/tutorial/04_stream/stream.cpp index ce8e744..02f3f0f 100644 --- a/tutorial/04_stream/stream.cpp +++ b/tutorial/04_stream/stream.cpp @@ -3,15 +3,23 @@ #include #include #include +#include #include #include #include +#include #include "nvexec/stream_context.cuh" -#include "exec/on.hpp" +#include #include "stream.hpp" using counting_iterator = thrust::counting_iterator; +#if defined(ENABLE_OPENMP) + using Vector = thrust::host_vector; +#else + using Vector = thrust::device_vector; +#endif + constexpr std::size_t ARRAY_SIZE = 128*128*128*128; constexpr std::size_t nbiter = 100; constexpr double start_A = 0.1; @@ -60,9 +68,9 @@ void average(const std::size_t n, UnarayOperation const unary_op, OutputType &re template void checkSolution(const std::size_t nbiter, - const thrust::device_vector& a, - const thrust::device_vector& b, - const thrust::device_vector& c, + const Vector& a, + const Vector& b, + const Vector& c, const RealType& sum) { // Generate correct solution RealType gold_A = start_A; @@ -128,14 +136,19 @@ void checkSolution(const std::size_t nbiter, }; int main(int argc, char *argv[]) { - // Declare a CUDA stream - nvexec::stream_context stream_ctx{}; - auto scheduler = stream_ctx.get_scheduler(); + #if defined(ENABLE_OPENMP) + exec::static_thread_pool pool{std::thread::hardware_concurrency()}; + auto scheduler = pool.get_scheduler(); + #else + // Declare a CUDA stream + nvexec::stream_context stream_ctx{}; + auto scheduler = stream_ctx.get_scheduler(); + #endif // Declare device vectors - thrust::device_vector a(ARRAY_SIZE); - thrust::device_vector b(ARRAY_SIZE); - thrust::device_vector c(ARRAY_SIZE); + Vector a(ARRAY_SIZE); + Vector b(ARRAY_SIZE); + Vector c(ARRAY_SIZE); double* ptr_a = (double *)thrust::raw_pointer_cast(a.data()); double* ptr_b = (double *)thrust::raw_pointer_cast(b.data()); double sum = 0.0; @@ -198,6 +211,12 @@ int main(int argc, char *argv[]) { checkSolution(nbiter, a, b, c, sum); + #if defined(ENABLE_OPENMP) + std::cout << "OpenMP backend" << std::endl; + #else + std::cout << "CUDA backend" << std::endl; + #endif + std::cout << "function" << csv_separator << "num_times" << csv_separator diff --git a/tutorial/04_stream/stream.hpp b/tutorial/04_stream/stream.hpp index b4ed446..24b76a8 100644 --- a/tutorial/04_stream/stream.hpp +++ b/tutorial/04_stream/stream.hpp @@ -1,11 +1,17 @@ #ifndef __STREAM_HPP__ #define __STREAM_HPP__ +#include #include #include template struct init_functor{ + #if defined(ENABLE_OPENMP) + using Vector = thrust::host_vector; + #else + using 
diff --git a/tutorial/04_stream/stream.hpp b/tutorial/04_stream/stream.hpp
index b4ed446..24b76a8 100644
--- a/tutorial/04_stream/stream.hpp
+++ b/tutorial/04_stream/stream.hpp
@@ -1,11 +1,17 @@
 #ifndef __STREAM_HPP__
 #define __STREAM_HPP__
 
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include
 
 template <typename RealType>
 struct init_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType start_A_;
   const RealType start_B_;
   const RealType start_C_;
@@ -16,9 +22,9 @@ struct init_functor{
   init_functor(const RealType start_A,
                const RealType start_B,
                const RealType start_C,
-               thrust::device_vector<RealType>& a,
-               thrust::device_vector<RealType>& b,
-               thrust::device_vector<RealType>& c) : start_A_(start_A), start_B_(start_B), start_C_(start_C) {
+               Vector& a,
+               Vector& b,
+               Vector& c) : start_A_(start_A), start_B_(start_B), start_C_(start_C) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
@@ -34,11 +40,16 @@ struct copy_functor{
 
 template <typename RealType>
 struct copy_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   RealType *ptr_a_;
   RealType *ptr_c_;
 
-  copy_functor(const thrust::device_vector<RealType>& a,
-               thrust::device_vector<RealType>& c) {
+  copy_functor(const Vector& a,
+               Vector& c) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
   }
@@ -51,13 +62,18 @@ struct copy_functor{
 
 template <typename RealType>
 struct mul_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType scalar_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
   mul_functor(const RealType scalar,
-              const thrust::device_vector<RealType>& b,
-              thrust::device_vector<RealType>& c) : scalar_(scalar) {
+              const Vector& b,
+              Vector& c) : scalar_(scalar) {
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
   }
@@ -70,13 +86,18 @@ struct mul_functor{
 
 template <typename RealType>
 struct add_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   RealType *ptr_a_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
-  add_functor(const thrust::device_vector<RealType>& a,
-              const thrust::device_vector<RealType>& b,
-              thrust::device_vector<RealType>& c) {
+  add_functor(const Vector& a,
+              const Vector& b,
+              Vector& c) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
@@ -90,15 +111,20 @@ struct add_functor{
 
 template <typename RealType>
 struct triad_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType scalar_;
   RealType *ptr_a_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
   triad_functor(const RealType scalar,
-                const thrust::device_vector<RealType>& a,
-                const thrust::device_vector<RealType>& b,
-                thrust::device_vector<RealType>& c) : scalar_(scalar) {
+                const Vector& a,
+                const Vector& b,
+                Vector& c) : scalar_(scalar) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());

diff --git a/tutorial/05_heat2d/CMakeLists.txt b/tutorial/05_heat2d/CMakeLists.txt
index ceb485f..8b7a38c 100644
--- a/tutorial/05_heat2d/CMakeLists.txt
+++ b/tutorial/05_heat2d/CMakeLists.txt
@@ -1,3 +1,10 @@
 add_executable(05_heat_test heat2D.cpp)
 target_link_libraries(05_heat_test PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec)
 target_compile_features(05_heat_test PUBLIC cxx_std_20)
+
+set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
+if(BACKEND STREQUAL "OPENMP")
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(05_heat_test PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(05_heat_test PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP)
+endif()

diff --git a/tutorial/05_heat2d/heat2D.cpp b/tutorial/05_heat2d/heat2D.cpp
index 52e9a84..76ae0de 100644
--- a/tutorial/05_heat2d/heat2D.cpp
+++ b/tutorial/05_heat2d/heat2D.cpp
@@ -7,19 +7,25 @@
 #include
 #include
 #include
-#include "nvexec/stream_context.cuh"
+#if defined(ENABLE_OPENMP)
+  #include <exec/static_thread_pool.hpp>
+#else
+  #include "nvexec/stream_context.cuh"
+#endif
 #include "exec/on.hpp"
 #include "heat2D.hpp"
 
+using Vector = thrust::device_vector<double>;
+
 int main(int argc, char *argv[]) {
   // Set configuration
   Config conf;
 
   // Declare grid and values
-  thrust::device_vector<double> _x(conf.nx_);
-  thrust::device_vector<double> _y(conf.ny_);
-  thrust::device_vector<double> _u(conf.nx_ * conf.ny_);
-  thrust::device_vector<double> _un(conf.nx_ * conf.ny_);
+  Vector _x(conf.nx_);
+  Vector _y(conf.ny_);
+  Vector _u(conf.nx_ * conf.ny_);
+  Vector _un(conf.nx_ * conf.ny_);
 
   // Viewed to mdspans
   RealView1D x( (double *)thrust::raw_pointer_cast(_x.data()), std::array<std::size_t, 1>({conf.nx_}) );
@@ -26,9 +32,13 @@ int main(int argc, char *argv[]) {
   RealView1D y( (double *)thrust::raw_pointer_cast(_y.data()), std::array<std::size_t, 1>({conf.ny_}) );
   RealView2D u( (double *)thrust::raw_pointer_cast(_u.data()), std::array<std::size_t, 2>({conf.nx_, conf.ny_}) );
   RealView2D un( (double *)thrust::raw_pointer_cast(_un.data()), std::array<std::size_t, 2>({conf.nx_, conf.ny_}) );
 
-  // Declare a CUDA stream
-  nvexec::stream_context stream_ctx{};
-  auto scheduler = stream_ctx.get_scheduler();
+  #if defined(ENABLE_OPENMP)
+    exec::static_thread_pool pool{std::thread::hardware_concurrency()};
+    auto scheduler = pool.get_scheduler();
+  #else
+    nvexec::stream_context stream_ctx{};
+    auto scheduler = stream_ctx.get_scheduler();
+  #endif
 
   initialize(conf, scheduler, x, y, u, un);
   auto start = std::chrono::high_resolution_clock::now();

diff --git a/tutorial/05_heat2d/heat2D.hpp b/tutorial/05_heat2d/heat2D.hpp
index e803e0a..11df83a 100644
--- a/tutorial/05_heat2d/heat2D.hpp
+++ b/tutorial/05_heat2d/heat2D.hpp
@@ -222,6 +222,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 7 Flop per iteration
   const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 7 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
   std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
   std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;

diff --git a/tutorial/05_heat2d/types.hpp b/tutorial/05_heat2d/types.hpp
index c14b996..2642360 100644
--- a/tutorial/05_heat2d/types.hpp
+++ b/tutorial/05_heat2d/types.hpp
@@ -1,7 +1,6 @@
 #ifndef __TYPES_HPP__
 #define __TYPES_HPP__
 
-#include
 #include
 #include
 
@@ -11,7 +10,7 @@ namespace stdex = std::experimental;
 #if defined(_NVHPC_CUDA) || defined(__CUDACC__)
   using default_layout = stdex::layout_left;
 #else
-  using default_layout = stdex::layout_right;
+  using default_layout = stdex::layout_left;
 #endif
 
 using RealView1D = stdex::mdspan<double, stdex::dextents<std::size_t, 1>, default_layout>;
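Note on the types.hpp change that closes this patch: both branches now use layout_left, so the CUDA and OpenMP builds index memory identically and kernels tuned for one layout stay correct on the other. A small sketch of what the layout means for the reference mdspan implementation (illustrative only, not part of the patch):

    // layout_sketch.cpp -- illustrative only.
    #include <experimental/mdspan>
    #include <vector>
    namespace stdex = std::experimental;

    int main() {
      std::vector<double> buf(3 * 4);
      // layout_left is column-major: element (i, j) sits at buf[i + j * nx].
      stdex::mdspan<double, stdex::dextents<std::size_t, 2>, stdex::layout_left>
          v(buf.data(), 3, 4);
      v(2, 1) = 1.0;                   // writes buf[2 + 1 * 3] == buf[5]
      return buf[5] == 1.0 ? 0 : 1;    // returns 0: the mapping holds
    }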
From 4161b201687d0d21f1506f52e56f484f9d8f69a6 Mon Sep 17 00:00:00 2001
From: Yuuichi Asahi
Date: Wed, 2 Aug 2023 22:03:35 +0900
Subject: [PATCH 2/2] Add job scripts for OpenMP backend for executor implementations

---
 ...xecutors_heat3d_mpi_time_stamps_Icelake.sh |  8 +++-
 wk/sub_executors_tutorial_Icelake.sh          | 45 +++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 wk/sub_executors_tutorial_Icelake.sh

diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
index 4d83a9c..cdcc2cf 100644
--- a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
@@ -36,10 +36,14 @@ export UCX_IB_GPU_DIRECT_RDMA=no
 export OMP_NUM_THREADS=36
 export OMP_PROC_BIND=true
 
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
   ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
   ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
+##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+##  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+##  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
 
 #mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \
 #  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1024 --ny 1024 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1

diff --git a/wk/sub_executors_tutorial_Icelake.sh b/wk/sub_executors_tutorial_Icelake.sh
new file mode 100644
index 0000000..1b5b083
--- /dev/null
+++ b/wk/sub_executors_tutorial_Icelake.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#PJM -L "node=1"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=10:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=1
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMP_NUM_THREADS=36
+export OMP_PROC_BIND=true
+
+# STREAM
+../build/tutorial/04_stream/04_stream_test
+
+# Heat2D
+../build/tutorial/05_heat2d/05_heat_test