Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenMP backend for executors #29

Merged
merged 2 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions mini-apps/heat3d-mpi/executors/grid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@
#define __GRID_HPP__

#include <cmath>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include "../types.hpp"
#include "../config.hpp"

template <typename ScalarType>
inline thrust::device_vector<ScalarType> arange(const ScalarType start,
const ScalarType stop,
const ScalarType step=1
) {
inline auto arange(const ScalarType start,
const ScalarType stop,
const ScalarType step=1
) {
const size_t length = ceil((stop - start) / step);

thrust::device_vector<ScalarType> result(length);
#if defined(ENABLE_OPENMP)
thrust::host_vector<ScalarType> result(length);
#else
thrust::device_vector<ScalarType> result(length);
#endif

ScalarType delta = (stop - start) / length;
thrust::sequence(result.begin(), result.end(), start, delta);
Expand All @@ -27,7 +32,11 @@ struct Grid {
using Shape1D = shape_type<1>;

private:
thrust::device_vector<RealType> x_, y_, z_;
#if defined(ENABLE_OPENMP)
thrust::host_vector<RealType> x_, y_, z_;
#else
thrust::device_vector<RealType> x_, y_, z_;
#endif
Shape1D extents_nx_, extents_ny_, extents_nz_;

public:
Expand Down
6 changes: 6 additions & 0 deletions mini-apps/heat3d-mpi/executors/heat3D.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ static void report_performance(const Config& conf, double seconds) {
// 9 Flop per iteration
double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;

#if defined(ENABLE_OPENMP)
std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
#else
std::cout << "CUDA backend" << std::endl;
#endif

if(conf.is_async_) {
std::cout << "Communication and Computation Overlap" << std::endl;
}
Expand Down
15 changes: 10 additions & 5 deletions mini-apps/heat3d-mpi/executors/mpi_comm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cassert>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <complex>
#include <mpi.h>
Expand Down Expand Up @@ -45,7 +46,11 @@ template <typename RealType>
struct Halo {
using RealView2D = View2D<RealType>;
using Shpae2D = shape_type<2>;
using Vector = thrust::device_vector<RealType>;
#if defined(ENABLE_OPENMP)
using Vector = thrust::host_vector<RealType>;
#else
using Vector = thrust::device_vector<RealType>;
#endif

private:
Vector left_, right_;
Expand Down Expand Up @@ -393,10 +398,10 @@ class Comm {

MPI_Waitall( 4, request, status );
} else {
thrust::device_vector<double>& send_left_vector = send.left_vector();
thrust::device_vector<double>& send_right_vector = send.right_vector();
thrust::device_vector<double>& recv_left_vector = recv.left_vector();
thrust::device_vector<double>& recv_right_vector = recv.right_vector();
auto& send_left_vector = send.left_vector();
auto& send_right_vector = send.right_vector();
auto& recv_left_vector = recv.left_vector();
auto& recv_right_vector = recv.right_vector();

thrust::swap( send_left_vector, recv_right_vector );
thrust::swap( send_right_vector, recv_left_vector );
Expand Down
10 changes: 8 additions & 2 deletions mini-apps/heat3d-mpi/executors/variable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ struct Variable {

using Shape1D = shape_type<1>;
using Shape3D = shape_type<3>;
thrust::device_vector<RealType> u_, un_;
thrust::device_vector<RealType> x_mask_, y_mask_, z_mask_;

#if defined(ENABLE_OPENMP)
using Vector = thrust::host_vector<RealType>;
#else
using Vector = thrust::device_vector<RealType>;
#endif
Vector u_, un_;
Vector x_mask_, y_mask_, z_mask_;
Shape1D extents_x_, extents_y_, extents_z_;
Shape3D extents3D_;

Expand Down
6 changes: 6 additions & 0 deletions mini-apps/heat3d-mpi/stdpar/heat3D.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ static void report_performance(const Config& conf, double seconds) {
// 9 Flop per iteration
const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;

#if defined(ENABLE_OPENMP)
std::cout << "OpenMP backend" << std::endl;
#else
std::cout << "CUDA backend" << std::endl;
#endif

std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;
Expand Down
7 changes: 7 additions & 0 deletions mini-apps/heat3d/executors/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Executable for the heat3d mini-app built on the executors (stdexec) implementation.
add_executable(heat3d-exec heat3D.cpp)
target_compile_features(heat3d-exec PUBLIC cxx_std_20)
target_link_libraries(heat3d-exec
    PUBLIC
        Threads::Threads
        STDEXEC::stdexec
        STDEXEC::nvexec
)

# BACKEND is a user-settable cache entry (default: AUTO).
# Setting it to OPENMP links OpenMP, defines ENABLE_OPENMP for the sources,
# and switches Thrust's device system to its OpenMP implementation.
set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
if(BACKEND STREQUAL "OPENMP")
    find_package(OpenMP REQUIRED)
    target_compile_definitions(heat3d-exec
        PUBLIC
            ENABLE_OPENMP
            THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP
    )
    target_link_libraries(heat3d-exec PUBLIC OpenMP::OpenMP_CXX)
endif()
23 changes: 17 additions & 6 deletions mini-apps/heat3d/executors/grid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,23 @@

#include <cmath>
#include <thrust/sequence.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "../types.hpp"
#include "../config.hpp"

template <typename ScalarType>
inline thrust::device_vector<ScalarType> arange(const ScalarType start,
const ScalarType stop,
const ScalarType step=1
) {
inline auto arange(const ScalarType start,
const ScalarType stop,
const ScalarType step=1
) {
const size_t length = ceil((stop - start) / step);

thrust::device_vector<ScalarType> result(length);
#if defined(ENABLE_OPENMP)
thrust::host_vector<ScalarType> result(length);
#else
thrust::device_vector<ScalarType> result(length);
#endif

ScalarType delta = (stop - start) / length;
thrust::sequence(result.begin(), result.end(), start, delta);
Expand All @@ -26,8 +31,14 @@ struct Grid {
using RealView1D = View1D<RealType>;
using Shape1D = shape_type<1>;

#if defined(ENABLE_OPENMP)
using Vector = thrust::host_vector<RealType>;
#else
using Vector = thrust::device_vector<RealType>;
#endif

private:
thrust::device_vector<RealType> x_, y_, z_;
Vector x_, y_, z_;
Shape1D extents_nx_, extents_ny_, extents_nz_;

public:
Expand Down
17 changes: 13 additions & 4 deletions mini-apps/heat3d/executors/heat3D.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#include <chrono>
#include "nvexec/stream_context.cuh"
#include "../config.hpp"
#include "../parser.hpp"
#include "heat3D.hpp"
#include "variable.hpp"
#include "grid.hpp"

#if defined(ENABLE_OPENMP)
#include <exec/static_thread_pool.hpp>
#else
#include "nvexec/stream_context.cuh"
#endif

int main(int argc, char *argv[]) {
Parser parser(argc, argv);
auto nx = parser.shape_[0];
Expand All @@ -18,9 +23,13 @@ int main(int argc, char *argv[]) {
Grid<double> grid(conf);
Variable<double> variables(conf);

// Declare a CUDA stream
nvexec::stream_context stream_ctx{};
auto scheduler = stream_ctx.get_scheduler();
#if defined(ENABLE_OPENMP)
exec::static_thread_pool pool{std::thread::hardware_concurrency()};
auto scheduler = pool.get_scheduler();
#else
nvexec::stream_context stream_ctx{};
auto scheduler = stream_ctx.get_scheduler();
#endif

initialize(conf, grid, scheduler, variables);
auto start = std::chrono::high_resolution_clock::now();
Expand Down
6 changes: 6 additions & 0 deletions mini-apps/heat3d/executors/heat3D.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ static void report_performance(const Config& conf, double seconds) {
// 9 Flop per iteration
const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;

#if defined(ENABLE_OPENMP)
std::cout << "OpenMP backend with " << std::thread::hardware_concurrency() << " threads" << std::endl;
#else
std::cout << "CUDA backend" << std::endl;
#endif

std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;
Expand Down
9 changes: 8 additions & 1 deletion mini-apps/heat3d/executors/variable.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef __VARIABLE_HPP__
#define __VARIABLE_HPP__

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "../types.hpp"
#include "../config.hpp"
Expand All @@ -10,7 +11,13 @@ struct Variable {
private:
using RealView3D = View3D<RealType>;
using Shape3D = shape_type<3>;
thrust::device_vector<RealType> u_, un_;
#if defined(ENABLE_OPENMP)
using Vector = thrust::host_vector<RealType>;
#else
using Vector = thrust::device_vector<RealType>;
#endif

Vector u_, un_;
Shape3D extents3D_;

public:
Expand Down
6 changes: 6 additions & 0 deletions mini-apps/heat3d/stdpar/heat3D.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,12 @@ static void report_performance(const Config& conf, double seconds) {
// 9 Flop per iteration
const double GFlops = static_cast<double>(n) * static_cast<double>(conf.nbiter_) * 9 / 1.e9;

#if defined(ENABLE_OPENMP)
std::cout << "OpenMP backend" << std::endl;
#else
std::cout << "CUDA backend" << std::endl;
#endif

std::cout << "Elapsed time: " << seconds << " [s]" << std::endl;
std::cout << "Bandwidth: " << GBytes / seconds << " [GB/s]" << std::endl;
std::cout << "Flops: " << GFlops / seconds << " [GFlops]" << std::endl;
Expand Down
7 changes: 7 additions & 0 deletions tutorial/04_stream/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Executable for the stream tutorial (tutorial/04_stream).
add_executable(04_stream_test stream.cpp)
target_compile_features(04_stream_test PUBLIC cxx_std_20)
target_link_libraries(04_stream_test
    PUBLIC
        Threads::Threads
        STDEXEC::stdexec
        STDEXEC::nvexec
)

# BACKEND is a user-settable cache entry (default: AUTO).
# Setting it to OPENMP links OpenMP, defines ENABLE_OPENMP for the sources,
# and switches Thrust's device system to its OpenMP implementation.
set(BACKEND AUTO CACHE STRING "CHOICE OF PARALLEL BACKEND")
if(BACKEND STREQUAL "OPENMP")
    find_package(OpenMP REQUIRED)
    target_compile_definitions(04_stream_test
        PUBLIC
            ENABLE_OPENMP
            THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP
    )
    target_link_libraries(04_stream_test PUBLIC OpenMP::OpenMP_CXX)
endif()
39 changes: 29 additions & 10 deletions tutorial/04_stream/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,23 @@
#include <algorithm>
#include <numeric>
#include <stdexec/execution.hpp>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/execution_policy.h>
#include <exec/static_thread_pool.hpp>
#include "nvexec/stream_context.cuh"
#include "exec/on.hpp"
#include <exec/on.hpp>
#include "stream.hpp"

using counting_iterator = thrust::counting_iterator<std::size_t>;

#if defined(ENABLE_OPENMP)
using Vector = thrust::host_vector<double>;
#else
using Vector = thrust::device_vector<double>;
#endif

constexpr std::size_t ARRAY_SIZE = 128*128*128*128;
constexpr std::size_t nbiter = 100;
constexpr double start_A = 0.1;
Expand Down Expand Up @@ -60,9 +68,9 @@ void average(const std::size_t n, UnarayOperation const unary_op, OutputType &re

template <typename RealType>
void checkSolution(const std::size_t nbiter,
const thrust::device_vector<RealType>& a,
const thrust::device_vector<RealType>& b,
const thrust::device_vector<RealType>& c,
const Vector& a,
const Vector& b,
const Vector& c,
const RealType& sum) {
// Generate correct solution
RealType gold_A = start_A;
Expand Down Expand Up @@ -128,14 +136,19 @@ void checkSolution(const std::size_t nbiter,
};

int main(int argc, char *argv[]) {
// Declare a CUDA stream
nvexec::stream_context stream_ctx{};
auto scheduler = stream_ctx.get_scheduler();
#if defined(ENABLE_OPENMP)
exec::static_thread_pool pool{std::thread::hardware_concurrency()};
auto scheduler = pool.get_scheduler();
#else
// Declare a CUDA stream
nvexec::stream_context stream_ctx{};
auto scheduler = stream_ctx.get_scheduler();
#endif

// Declare the work vectors (thrust::host_vector with ENABLE_OPENMP, thrust::device_vector otherwise)
thrust::device_vector<double> a(ARRAY_SIZE);
thrust::device_vector<double> b(ARRAY_SIZE);
thrust::device_vector<double> c(ARRAY_SIZE);
Vector a(ARRAY_SIZE);
Vector b(ARRAY_SIZE);
Vector c(ARRAY_SIZE);
double* ptr_a = (double *)thrust::raw_pointer_cast(a.data());
double* ptr_b = (double *)thrust::raw_pointer_cast(b.data());
double sum = 0.0;
Expand Down Expand Up @@ -198,6 +211,12 @@ int main(int argc, char *argv[]) {

checkSolution(nbiter, a, b, c, sum);

#if defined(ENABLE_OPENMP)
std::cout << "OpenMP backend" << std::endl;
#else
std::cout << "CUDA backend" << std::endl;
#endif

std::cout
<< "function" << csv_separator
<< "num_times" << csv_separator
Expand Down
Loading