From 9cd97ac39f4816541df595893acce64fe7308695 Mon Sep 17 00:00:00 2001
From: Yuuichi Asahi
Date: Wed, 2 Aug 2023 22:02:38 +0900
Subject: [PATCH 1/2] Add OpenMP backend for executor implementations

---
 mini-apps/heat3d-mpi/executors/grid.hpp     | 21 ++++++---
 mini-apps/heat3d-mpi/executors/heat3D.hpp   |  6 +++
 mini-apps/heat3d-mpi/executors/mpi_comm.hpp | 15 ++++--
 mini-apps/heat3d-mpi/executors/variable.hpp | 10 +++-
 mini-apps/heat3d-mpi/stdpar/heat3D.hpp      |  6 +++
 mini-apps/heat3d/executors/CMakeLists.txt   |  7 +++
 mini-apps/heat3d/executors/grid.hpp         | 23 ++++++---
 mini-apps/heat3d/executors/heat3D.cpp       | 17 +++++--
 mini-apps/heat3d/executors/heat3D.hpp       |  6 +++
 mini-apps/heat3d/executors/variable.hpp     |  9 +++-
 mini-apps/heat3d/stdpar/heat3D.hpp          |  6 +++
 tutorial/04_stream/CMakeLists.txt           |  7 +++
 tutorial/04_stream/stream.cpp               | 39 ++++++++++----
 tutorial/04_stream/stream.hpp               | 52 +++++++++++++------
 tutorial/05_heat2d/CMakeLists.txt           |  7 +++
 tutorial/05_heat2d/heat2D.cpp               | 26 +++++++---
 tutorial/05_heat2d/heat2D.hpp               |  6 +++
 tutorial/05_heat2d/types.hpp                |  3 +-
 18 files changed, 209 insertions(+), 57 deletions(-)

diff --git a/mini-apps/heat3d-mpi/executors/grid.hpp b/mini-apps/heat3d-mpi/executors/grid.hpp
index 81b5e4a..ffa822a 100644
--- a/mini-apps/heat3d-mpi/executors/grid.hpp
+++ b/mini-apps/heat3d-mpi/executors/grid.hpp
@@ -2,19 +2,24 @@
 #define __GRID_HPP__
 
 #include <cmath>
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 #include "../types.hpp"
 #include "../config.hpp"
 
 template <typename ScalarType>
-inline thrust::device_vector<ScalarType> arange(const ScalarType start,
-                                                const ScalarType stop,
-                                                const ScalarType step=1
-                                               ) {
+inline auto arange(const ScalarType start,
+                   const ScalarType stop,
+                   const ScalarType step=1
+                  ) {
   const size_t length = ceil((stop - start) / step);
-  thrust::device_vector<ScalarType> result(length);
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<ScalarType> result(length);
+  #else
+    thrust::device_vector<ScalarType> result(length);
+  #endif
   ScalarType delta = (stop - start) / length;
   thrust::sequence(result.begin(), result.end(), start, delta);
@@ -27,7 +32,11 @@ struct Grid {
   using Shape1D = shape_type<1>;
 
 private:
-  thrust::device_vector<RealType> x_, y_, z_;
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<RealType> x_, y_, z_;
+  #else
+    thrust::device_vector<RealType> x_, y_, z_;
+  #endif
   Shape1D extents_nx_, extents_ny_, extents_nz_;
 
 public:

diff --git a/mini-apps/heat3d-mpi/executors/heat3D.hpp b/mini-apps/heat3d-mpi/executors/heat3D.hpp
index 349e0ea..923a982 100644
--- a/mini-apps/heat3d-mpi/executors/heat3D.hpp
+++ b/mini-apps/heat3d-mpi/executors/heat3D.hpp
@@ -167,6 +167,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 9 Flop per iteration
   double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   if(conf.is_async_) {
     std::cout << "Communication and Computation Overlap" << std::endl;
   }
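Note: the container switch above is the core idiom of this patch — under ENABLE_OPENMP the data lives in a thrust::host_vector, otherwise in a thrust::device_vector, and the Thrust algorithm calls are identical either way. A minimal standalone sketch of the idiom (illustrative only, not part of the patch; the file name and main() driver are hypothetical):

    // backend_switch_sketch.cpp -- illustrative only.
    // ENABLE_OPENMP is the compile definition added by the CMake changes below.
    #include <iostream>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>

    #if defined(ENABLE_OPENMP)
      using Vector = thrust::host_vector<double>;   // data stays in host memory
    #else
      using Vector = thrust::device_vector<double>; // data lives in GPU memory
    #endif

    int main() {
      Vector v(10);
      thrust::sequence(v.begin(), v.end());  // same algorithm call on both backends
      std::cout << v[9] << std::endl;        // prints 9 on either backend
      return 0;
    }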
diff --git a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
index 53feeea..e888be4 100644
--- a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
+++ b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp
@@ -3,6 +3,7 @@
 
 #include <mpi.h>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 #include
 #include
 #include
@@ -45,7 +46,11 @@ template <typename RealType>
 struct Halo {
   using RealView2D = View2D<RealType>;
   using Shpae2D = shape_type<2>;
-  using Vector = thrust::device_vector<RealType>;
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
 
 private:
   Vector left_, right_;
@@ -393,10 +398,10 @@ class Comm {
       MPI_Waitall( 4, request, status );
     } else {
-      thrust::device_vector<RealType>& send_left_vector = send.left_vector();
-      thrust::device_vector<RealType>& send_right_vector = send.right_vector();
-      thrust::device_vector<RealType>& recv_left_vector = recv.left_vector();
-      thrust::device_vector<RealType>& recv_right_vector = recv.right_vector();
+      auto& send_left_vector = send.left_vector();
+      auto& send_right_vector = send.right_vector();
+      auto& recv_left_vector = recv.left_vector();
+      auto& recv_right_vector = recv.right_vector();
       thrust::swap( send_left_vector, recv_right_vector );
       thrust::swap( send_right_vector, recv_left_vector );

diff --git a/mini-apps/heat3d-mpi/executors/variable.hpp b/mini-apps/heat3d-mpi/executors/variable.hpp
index d07a558..94e32d6 100644
--- a/mini-apps/heat3d-mpi/executors/variable.hpp
+++ b/mini-apps/heat3d-mpi/executors/variable.hpp
@@ -14,8 +14,14 @@ struct Variable {
   using Shape1D = shape_type<1>;
   using Shape3D = shape_type<3>;
 
-  thrust::device_vector<RealType> u_, un_;
-  thrust::device_vector<RealType> x_mask_, y_mask_, z_mask_;
+
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
+  Vector u_, un_;
+  Vector x_mask_, y_mask_, z_mask_;
   Shape1D extents_x_, extents_y_, extents_z_;
   Shape3D extents3D_;

diff --git a/mini-apps/heat3d-mpi/stdpar/heat3D.hpp b/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
index d74750b..3cb2826 100644
--- a/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
+++ b/mini-apps/heat3d-mpi/stdpar/heat3D.hpp
@@ -206,6 +206,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 9 Flop per iteration
   const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
   std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
   std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;

diff --git a/mini-apps/heat3d/executors/CMakeLists.txt b/mini-apps/heat3d/executors/CMakeLists.txt
index 998777b..5ba9354 100644
--- a/mini-apps/heat3d/executors/CMakeLists.txt
+++ b/mini-apps/heat3d/executors/CMakeLists.txt
@@ -1,3 +1,10 @@
 add_executable(heat3d-exec heat3D.cpp)
 target_link_libraries(heat3d-exec PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec)
 target_compile_features(heat3d-exec PUBLIC cxx_std_20)
+
+set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
+if(BACKEND STREQUAL "OPENMP")
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(heat3d-exec PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(heat3d-exec PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP)
+endif()
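Note: the backend is chosen at configure time, e.g. `cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP ..` (the exact command used in the job script of the second patch). Besides defining ENABLE_OPENMP, the option sets THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP, which retargets Thrust's "device" algorithms to OpenMP threads on the host. A small sketch of that effect (illustrative only; not part of the patch):

    // thrust_omp_sketch.cpp -- illustrative only.
    // Built with -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP (plus OpenMP flags),
    // the "device" algorithm below runs on host OpenMP threads instead of a GPU.
    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>

    int main() {
      thrust::device_vector<double> a(1000, 1.0), b(1000, 2.0), c(1000);
      // Dispatches to the OMP device system when so configured.
      thrust::transform(a.begin(), a.end(), b.begin(), c.begin(),
                        thrust::plus<double>());
      return c[0] == 3.0 ? 0 : 1;
    }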
diff --git a/mini-apps/heat3d/executors/grid.hpp b/mini-apps/heat3d/executors/grid.hpp
index 80b04a4..2656093 100644
--- a/mini-apps/heat3d/executors/grid.hpp
+++ b/mini-apps/heat3d/executors/grid.hpp
@@ -3,18 +3,23 @@
 
 #include <cmath>
 #include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 #include <thrust/sequence.h>
 #include "../types.hpp"
 #include "../config.hpp"
 
 template <typename ScalarType>
-inline thrust::device_vector<ScalarType> arange(const ScalarType start,
-                                                const ScalarType stop,
-                                                const ScalarType step=1
-                                               ) {
+inline auto arange(const ScalarType start,
+                   const ScalarType stop,
+                   const ScalarType step=1
+                  ) {
   const size_t length = ceil((stop - start) / step);
-  thrust::device_vector<ScalarType> result(length);
+  #if defined(ENABLE_OPENMP)
+    thrust::host_vector<ScalarType> result(length);
+  #else
+    thrust::device_vector<ScalarType> result(length);
+  #endif
   ScalarType delta = (stop - start) / length;
   thrust::sequence(result.begin(), result.end(), start, delta);
@@ -26,8 +31,14 @@ struct Grid {
   using RealView1D = View1D<RealType>;
   using Shape1D = shape_type<1>;
 
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
+
 private:
-  thrust::device_vector<RealType> x_, y_, z_;
+  Vector x_, y_, z_;
   Shape1D extents_nx_, extents_ny_, extents_nz_;
 
 public:

diff --git a/mini-apps/heat3d/executors/heat3D.cpp b/mini-apps/heat3d/executors/heat3D.cpp
index eef311e..b5b33e0 100644
--- a/mini-apps/heat3d/executors/heat3D.cpp
+++ b/mini-apps/heat3d/executors/heat3D.cpp
@@ -1,11 +1,16 @@
 #include <chrono>
-#include "nvexec/stream_context.cuh"
 #include "../config.hpp"
 #include "../parser.hpp"
 #include "heat3D.hpp"
 #include "variable.hpp"
 #include "grid.hpp"
 
+#if defined(ENABLE_OPENMP)
+  #include <exec/static_thread_pool.hpp>
+#else
+  #include "nvexec/stream_context.cuh"
+#endif
+
 int main(int argc, char *argv[]) {
   Parser parser(argc, argv);
   auto nx = parser.shape_[0];
@@ -18,9 +23,13 @@ int main(int argc, char *argv[]) {
   Grid<double> grid(conf);
   Variable<double> variables(conf);
 
-  // Declare a CUDA stream
-  nvexec::stream_context stream_ctx{};
-  auto scheduler = stream_ctx.get_scheduler();
+  #if defined(ENABLE_OPENMP)
+    exec::static_thread_pool pool{std::thread::hardware_concurrency()};
+    auto scheduler = pool.get_scheduler();
+  #else
+    nvexec::stream_context stream_ctx{};
+    auto scheduler = stream_ctx.get_scheduler();
+  #endif
 
   initialize(conf, grid, scheduler, variables);
   auto start = std::chrono::high_resolution_clock::now();
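Note: the only backend-specific piece in main() is the scheduler; the sender pipelines in heat3D.hpp are written against a generic scheduler type. A reduced sketch of the pattern (illustrative only; the exact stdexec spellings depend on the stdexec revision this repo pins):

    // scheduler_switch_sketch.cpp -- illustrative only.
    #include <thread>
    #include <stdexec/execution.hpp>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>

    #if defined(ENABLE_OPENMP)
      #include <exec/static_thread_pool.hpp>
      using Vector = thrust::host_vector<double>;
    #else
      #include "nvexec/stream_context.cuh"
      using Vector = thrust::device_vector<double>;
    #endif

    int main() {
      #if defined(ENABLE_OPENMP)
        exec::static_thread_pool pool{std::thread::hardware_concurrency()};
        auto scheduler = pool.get_scheduler();
      #else
        nvexec::stream_context stream_ctx{};
        auto scheduler = stream_ctx.get_scheduler();
      #endif

      Vector v(100);
      double* ptr = thrust::raw_pointer_cast(v.data());
      // One bulk kernel, expressed once, runs on whichever scheduler was picked.
      auto work = stdexec::schedule(scheduler)
                | stdexec::bulk(100, [=](std::size_t i) { ptr[i] = 2.0 * static_cast<double>(i); });
      stdexec::sync_wait(std::move(work));
      return 0;
    }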
"CUDA backend" << std::endl; + #endif + std::cout << "Elapsed time: " << seconds << " [s]" << std::endl; std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl; std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl; diff --git a/tutorial/04_stream/CMakeLists.txt b/tutorial/04_stream/CMakeLists.txt index 612bc0d..447a326 100644 --- a/tutorial/04_stream/CMakeLists.txt +++ b/tutorial/04_stream/CMakeLists.txt @@ -1,3 +1,10 @@ add_executable(04_stream_test stream.cpp) target_link_libraries(04_stream_test PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec) target_compile_features(04_stream_test PUBLIC cxx_std_20) + +set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND") +if(BACKEND STREQUAL "OPENMP") + find_package(OpenMP REQUIRED) + target_link_libraries(04_stream_test PUBLIC OpenMP::OpenMP_CXX) + target_compile_definitions(04_stream_test PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP) +endif() diff --git a/tutorial/04_stream/stream.cpp b/tutorial/04_stream/stream.cpp index ce8e744..02f3f0f 100644 --- a/tutorial/04_stream/stream.cpp +++ b/tutorial/04_stream/stream.cpp @@ -3,15 +3,23 @@ #include #include #include +#include #include #include #include +#include #include "nvexec/stream_context.cuh" -#include "exec/on.hpp" +#include #include "stream.hpp" using counting_iterator = thrust::counting_iterator; +#if defined(ENABLE_OPENMP) + using Vector = thrust::host_vector; +#else + using Vector = thrust::device_vector; +#endif + constexpr std::size_t ARRAY_SIZE = 128*128*128*128; constexpr std::size_t nbiter = 100; constexpr double start_A = 0.1; @@ -60,9 +68,9 @@ void average(const std::size_t n, UnarayOperation const unary_op, OutputType &re template void checkSolution(const std::size_t nbiter, - const thrust::device_vector& a, - const thrust::device_vector& b, - const thrust::device_vector& c, + const Vector& a, + const Vector& b, + const Vector& c, const RealType& sum) { // Generate correct solution RealType gold_A = start_A; @@ -128,14 +136,19 @@ void checkSolution(const std::size_t nbiter, }; int main(int argc, char *argv[]) { - // Declare a CUDA stream - nvexec::stream_context stream_ctx{}; - auto scheduler = stream_ctx.get_scheduler(); + #if defined(ENABLE_OPENMP) + exec::static_thread_pool pool{std::thread::hardware_concurrency()}; + auto scheduler = pool.get_scheduler(); + #else + // Declare a CUDA stream + nvexec::stream_context stream_ctx{}; + auto scheduler = stream_ctx.get_scheduler(); + #endif // Declare device vectors - thrust::device_vector a(ARRAY_SIZE); - thrust::device_vector b(ARRAY_SIZE); - thrust::device_vector c(ARRAY_SIZE); + Vector a(ARRAY_SIZE); + Vector b(ARRAY_SIZE); + Vector c(ARRAY_SIZE); double* ptr_a = (double *)thrust::raw_pointer_cast(a.data()); double* ptr_b = (double *)thrust::raw_pointer_cast(b.data()); double sum = 0.0; @@ -198,6 +211,12 @@ int main(int argc, char *argv[]) { checkSolution(nbiter, a, b, c, sum); + #if defined(ENABLE_OPENMP) + std::cout << "OpenMP backend" << std::endl; + #else + std::cout << "CUDA backend" << std::endl; + #endif + std::cout << "function" << csv_separator << "num_times" << csv_separator diff --git a/tutorial/04_stream/stream.hpp b/tutorial/04_stream/stream.hpp index b4ed446..24b76a8 100644 --- a/tutorial/04_stream/stream.hpp +++ b/tutorial/04_stream/stream.hpp @@ -1,11 +1,17 @@ #ifndef __STREAM_HPP__ #define __STREAM_HPP__ +#include #include #include template struct init_functor{ + #if defined(ENABLE_OPENMP) + using Vector = thrust::host_vector; + #else + using 
diff --git a/tutorial/04_stream/stream.hpp b/tutorial/04_stream/stream.hpp
index b4ed446..24b76a8 100644
--- a/tutorial/04_stream/stream.hpp
+++ b/tutorial/04_stream/stream.hpp
@@ -1,11 +1,17 @@
 #ifndef __STREAM_HPP__
 #define __STREAM_HPP__
 
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include
 
 template <typename RealType>
 struct init_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType start_A_;
   const RealType start_B_;
   const RealType start_C_;
@@ -16,9 +22,9 @@ struct init_functor{
   init_functor(const RealType start_A,
                const RealType start_B,
                const RealType start_C,
-               thrust::device_vector<RealType>& a,
-               thrust::device_vector<RealType>& b,
-               thrust::device_vector<RealType>& c) : start_A_(start_A), start_B_(start_B), start_C_(start_C) {
+               Vector& a,
+               Vector& b,
+               Vector& c) : start_A_(start_A), start_B_(start_B), start_C_(start_C) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
@@ -34,11 +40,16 @@ struct copy_functor{
 
 template <typename RealType>
 struct copy_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   RealType *ptr_a_;
   RealType *ptr_c_;
 
-  copy_functor(const thrust::device_vector<RealType>& a,
-               thrust::device_vector<RealType>& c) {
+  copy_functor(const Vector& a,
+               Vector& c) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
   }
@@ -51,13 +62,18 @@ struct copy_functor{
 
 template <typename RealType>
 struct mul_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType scalar_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
   mul_functor(const RealType scalar,
-              const thrust::device_vector<RealType>& b,
-              thrust::device_vector<RealType>& c) : scalar_(scalar) {
+              const Vector& b,
+              Vector& c) : scalar_(scalar) {
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
   }
@@ -70,13 +86,18 @@ struct mul_functor{
 
 template <typename RealType>
 struct add_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   RealType *ptr_a_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
-  add_functor(const thrust::device_vector<RealType>& a,
-              const thrust::device_vector<RealType>& b,
-              thrust::device_vector<RealType>& c) {
+  add_functor(const Vector& a,
+              const Vector& b,
+              Vector& c) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());
@@ -90,15 +111,20 @@ struct add_functor{
 
 template <typename RealType>
 struct triad_functor{
+  #if defined(ENABLE_OPENMP)
+    using Vector = thrust::host_vector<RealType>;
+  #else
+    using Vector = thrust::device_vector<RealType>;
+  #endif
   const RealType scalar_;
   RealType *ptr_a_;
   RealType *ptr_b_;
   RealType *ptr_c_;
 
   triad_functor(const RealType scalar,
-                const thrust::device_vector<RealType>& a,
-                const thrust::device_vector<RealType>& b,
-                thrust::device_vector<RealType>& c) : scalar_(scalar) {
+                const Vector& a,
+                const Vector& b,
+                Vector& c) : scalar_(scalar) {
     ptr_a_ = (RealType *)thrust::raw_pointer_cast(a.data());
     ptr_b_ = (RealType *)thrust::raw_pointer_cast(b.data());
     ptr_c_ = (RealType *)thrust::raw_pointer_cast(c.data());

diff --git a/tutorial/05_heat2d/CMakeLists.txt b/tutorial/05_heat2d/CMakeLists.txt
index ceb485f..8b7a38c 100644
--- a/tutorial/05_heat2d/CMakeLists.txt
+++ b/tutorial/05_heat2d/CMakeLists.txt
@@ -1,3 +1,10 @@
 add_executable(05_heat_test heat2D.cpp)
 target_link_libraries(05_heat_test PUBLIC Threads::Threads STDEXEC::stdexec STDEXEC::nvexec)
 target_compile_features(05_heat_test PUBLIC cxx_std_20)
+
+set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
+if(BACKEND STREQUAL "OPENMP")
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(05_heat_test PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(05_heat_test PUBLIC ENABLE_OPENMP THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP)
+endif()

diff --git a/tutorial/05_heat2d/heat2D.cpp b/tutorial/05_heat2d/heat2D.cpp
index 52e9a84..76ae0de 100644
--- a/tutorial/05_heat2d/heat2D.cpp
+++ b/tutorial/05_heat2d/heat2D.cpp
@@ -7,19 +7,25 @@
 #include
 #include
 #include
-#include "nvexec/stream_context.cuh"
+#if defined(ENABLE_OPENMP)
+  #include <exec/static_thread_pool.hpp>
+#else
+  #include "nvexec/stream_context.cuh"
+#endif
 #include "exec/on.hpp"
 #include "heat2D.hpp"
 
+using Vector = thrust::device_vector<double>;
+
 int main(int argc, char *argv[]) {
   // Set configuration
   Config conf;
 
   // Declare grid and values
-  thrust::device_vector<double> _x(conf.nx_);
-  thrust::device_vector<double> _y(conf.ny_);
-  thrust::device_vector<double> _u(conf.nx_ * conf.ny_);
-  thrust::device_vector<double> _un(conf.nx_ * conf.ny_);
+  Vector _x(conf.nx_);
+  Vector _y(conf.ny_);
+  Vector _u(conf.nx_ * conf.ny_);
+  Vector _un(conf.nx_ * conf.ny_);
 
   // Viewed to mdspans
   RealView1D x( (double *)thrust::raw_pointer_cast(_x.data()), std::array<std::size_t, 1>({conf.nx_}) );
@@ -26,9 +32,13 @@ int main(int argc, char *argv[]) {
   RealView1D y( (double *)thrust::raw_pointer_cast(_y.data()), std::array<std::size_t, 1>({conf.ny_}) );
   RealView2D u( (double *)thrust::raw_pointer_cast(_u.data()), std::array<std::size_t, 2>({conf.nx_, conf.ny_}) );
   RealView2D un( (double *)thrust::raw_pointer_cast(_un.data()), std::array<std::size_t, 2>({conf.nx_, conf.ny_}) );
 
-  // Declare a CUDA stream
-  nvexec::stream_context stream_ctx{};
-  auto scheduler = stream_ctx.get_scheduler();
+  #if defined(ENABLE_OPENMP)
+    exec::static_thread_pool pool{std::thread::hardware_concurrency()};
+    auto scheduler = pool.get_scheduler();
+  #else
+    nvexec::stream_context stream_ctx{};
+    auto scheduler = stream_ctx.get_scheduler();
+  #endif
 
   initialize(conf, scheduler, x, y, u, un);
   auto start = std::chrono::high_resolution_clock::now();

diff --git a/tutorial/05_heat2d/heat2D.hpp b/tutorial/05_heat2d/heat2D.hpp
index e803e0a..11df83a 100644
--- a/tutorial/05_heat2d/heat2D.hpp
+++ b/tutorial/05_heat2d/heat2D.hpp
@@ -222,6 +222,12 @@ static void report_performance(const Config& conf, double seconds) {
   // 7 Flop per iteration
   const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 7 / 1.e9;
 
+  #if defined(ENABLE_OPENMP)
+    std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
+  #else
+    std::cout << "CUDA backend" << std::endl;
+  #endif
+
   std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
   std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
   std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;

diff --git a/tutorial/05_heat2d/types.hpp b/tutorial/05_heat2d/types.hpp
index c14b996..2642360 100644
--- a/tutorial/05_heat2d/types.hpp
+++ b/tutorial/05_heat2d/types.hpp
@@ -1,7 +1,6 @@
 #ifndef __TYPES_HPP__
 #define __TYPES_HPP__
 
-#include
 #include
 #include
 
@@ -11,7 +10,7 @@ namespace stdex = std::experimental;
 #if defined(_NVHPC_CUDA) || defined(__CUDACC__)
   using default_layout = stdex::layout_left;
 #else
-  using default_layout = stdex::layout_right;
+  using default_layout = stdex::layout_left;
 #endif
 
 using RealView1D = stdex::mdspan<double, stdex::dextents<std::size_t, 1>, default_layout>;
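Note on the types.hpp change that closes this patch: both branches now use layout_left, so the CUDA and OpenMP builds index memory identically and kernels tuned for one layout stay correct on the other. A small sketch of what the layout means for the reference mdspan implementation (illustrative only, not part of the patch):

    // layout_sketch.cpp -- illustrative only.
    #include <experimental/mdspan>
    #include <vector>
    namespace stdex = std::experimental;

    int main() {
      std::vector<double> buf(3 * 4);
      // layout_left is column-major: element (i, j) sits at buf[i + j * nx].
      stdex::mdspan<double, stdex::dextents<std::size_t, 2>, stdex::layout_left>
          v(buf.data(), 3, 4);
      v(2, 1) = 1.0;                   // writes buf[2 + 1 * 3] == buf[5]
      return buf[5] == 1.0 ? 0 : 1;    // returns 0: the mapping holds
    }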
From 4161b201687d0d21f1506f52e56f484f9d8f69a6 Mon Sep 17 00:00:00 2001
From: Yuuichi Asahi
Date: Wed, 2 Aug 2023 22:03:35 +0900
Subject: [PATCH 2/2] Add job scripts for OpenMP backend for executor implementations

---
 ...xecutors_heat3d_mpi_time_stamps_Icelake.sh |  8 +++-
 wk/sub_executors_tutorial_Icelake.sh          | 45 +++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 wk/sub_executors_tutorial_Icelake.sh

diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
index 4d83a9c..cdcc2cf 100644
--- a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
@@ -36,10 +36,14 @@ export UCX_IB_GPU_DIRECT_RDMA=no
 export OMP_NUM_THREADS=36
 export OMP_PROC_BIND=true
 
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
   ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
   ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
+##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+##  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
+##  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
 
 #mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \
 #  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1024 --ny 1024 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1

diff --git a/wk/sub_executors_tutorial_Icelake.sh b/wk/sub_executors_tutorial_Icelake.sh
new file mode 100644
index 0000000..1b5b083
--- /dev/null
+++ b/wk/sub_executors_tutorial_Icelake.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#PJM -L "node=1"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=10:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=1
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMP_NUM_THREADS=36
+export OMP_PROC_BIND=true
+
+# STREAM
+../build/tutorial/04_stream/04_stream_test
+
+# Heat2D
+../build/tutorial/05_heat2d/05_heat_test