-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrix_add_sycl.cpp
71 lines (65 loc) · 3.02 KB
/
matrix_add_sycl.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#if defined(TRISYCL_OPENCL) && defined(__APPLE__)
// TODO: The OpenCL header on macOS triggers a weird macro explosion
// during the expansion of the CL_DEPRECATED decorator macro; this
// should be investigated further. Just silence it for now.
#define CL_SILENCE_DEPRECATION 1
#endif
#include <CL/sycl.hpp>
#include <cstddef>
#include <cstdlib>
#include <iostream>
using namespace cl;
// Matrix extents: N along the first dimension, M along the second
constexpr std::size_t N{2000};
constexpr std::size_t M{3000};
int main() {
// Create a queue to work on default device
sycl::queue q;
// Create some 2D buffers with N×M float values for our matrices
sycl::buffer<double, 2> a{{N, M}};
sycl::buffer<double, 2> b{{N, M}};
sycl::buffer<double, 2> c{{N, M}};
// First launch an asynchronous kernel to initialize buffer "a"
q.submit([&](sycl::handler &cgh) {
// The kernel writes "a", so get a write accessor to it
auto A = a.get_access<sycl::access::mode::write>(cgh);
// Enqueue parallel kernel on an N×M 2D iteration space
cgh.parallel_for<class InitAKernel>(sycl::range<2>{N, M}, [=](sycl::item<2> idx) {
A[idx] = idx[0] * 2 + idx[1];
});
});
// Launch an asynchronous kernel to initialize buffer "b"
q.submit([&](sycl::handler &cgh) {
// The kernel writes to "b", so get a write accessor on it
auto B = b.get_access<sycl::access::mode::write>(cgh);
// Enqueue a parallel kernel on an N×M 2D iteration space
cgh.parallel_for<class InitBKernel>(sycl::range<2>{N, M}, [=](sycl::item<2> idx) {
B[idx] = idx[0] * 2014 + idx[1] * 42;
});
});
// Launch an asynchronous kernel to compute matrix addition c = a + b
q.submit([&](sycl::handler &cgh) {
// In the kernel "a" and "b" are read, but "c" is written.
// Since the kernel reads "a" and "b", the runtime will implicitly add
// a producer-consumer dependency to the previous kernels producing them.
auto A = a.get_access<sycl::access::mode::read>(cgh);
auto B = b.get_access<sycl::access::mode::read>(cgh);
auto C = c.get_access<sycl::access::mode::write>(cgh);
// Enqueue a parallel kernel on an N×M 2D iteration space
cgh.parallel_for<class MatrixAddKernel>(
sycl::range<2>{N, M}, [=](sycl::item<2> idx) { C[idx] = A[idx] + B[idx]; });
});
// Request an access to read "c" from the host-side. The SYCL runtime
// will wait for "c" to be ready available on the host side before
// returning the accessor.
// This means that there is no communication happening in the nested loop below.
auto C = c.get_access<sycl::access::mode::read>();
// Correctness check
for (std::size_t i = 0; i < N; i++)
for (std::size_t j = 0; j < M; j++)
// Compare the result to the analytic value
if (C[i][j] != i * (2 + 2014) + j * (1 + 42)) {
std::cout << "Wrong value " << C[i][j] << " on element " << i << ' ' << j
<< '\n';
exit(EXIT_FAILURE);
}
}