//==============================================================
// Matrix Multiplication: DPC++ Basic Parallel Kernel
//==============================================================
// Copyright © 2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
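//
// Build and run (one possible invocation, assuming the Intel oneAPI DPC++/C++
// compiler is installed and the environment is initialized, e.g. via
// `source /opt/intel/oneapi/setvars.sh`):
//
//   icpx -fsycl naive.cpp -o naive
//   ./naive -n 1024 -m 16 -v
//
// The -n and -m values are examples; the matrix size N must be a multiple of
// the work-group size M for the nd_range kernels below.
// =============================================================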
#include <sycl/sycl.hpp>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <getopt.h>
using namespace sycl;
int main(int argc, char *argv[]) {

  size_t N = 1024;
  size_t M = 16;
  int VERIFY = 0;
  int PRINT_OUTPUT_MATRIX = 0;

  //# Parse command-line options ('h' is included in the option string so the help text is reachable)
  int arg;
  while ((arg = getopt(argc, argv, "n:m:vph")) != -1)
    switch (arg) {
      case 'n':
        N = std::atoi(optarg);
        break;
      case 'm':
        M = std::atoi(optarg);
        break;
      case 'v':
        VERIFY = 1;
        break;
      case 'p':
        PRINT_OUTPUT_MATRIX = 1;
        break;
      case 'h':
        std::cout << std::endl;
        std::cout << "Usage   : ./a.out -n <MATRIX_SIZE> -m <WORK_GROUP_SIZE> -v -p\n\n";
        std::cout << "          [-n] size for matrix, eg: 1024\n";
        std::cout << "          [-m] size of work_group, eg: 8/16\n";
        std::cout << "          [-v] verify output with linear computation on cpu\n";
        std::cout << "          [-p] print output matrix\n";
        std::cout << "Example : ./a.out -n 1024 -m 16 -v -p\n\n";
        std::exit(0);
    }
  auto start = std::chrono::high_resolution_clock::now().time_since_epoch().count();

  //# Define queue with default device for offloading computation and enable profiling
  queue q(property::queue::enable_profiling{});

  //# Allocate USM shared memory for the input and output matrices
  auto matrix_a = malloc_shared<float>(N*N, q);
  auto matrix_b = malloc_shared<float>(N*N, q);
  auto matrix_c = malloc_shared<float>(N*N, q);
  auto matrix_d = malloc_shared<float>(N*N, q);

  //# Initialize the matrices
  float v1 = 2.f;
  float v2 = 3.f;
  for (int i=0; i<N; i++)
    for (int j=0; j<N; j++){
      matrix_a[i*N+j] = v1++;
      matrix_b[i*N+j] = v2++;
      matrix_c[i*N+j] = 0.f;
      matrix_d[i*N+j] = 0.f;
    }
  //# First we warm up the device so the timed kernel below does not pay initialization costs
  std::cout << "Warm-up first" << "\n";
  q.submit([&](handler &h){
    //# Define size for ND-range and work-group size
    range<2> global_size(N,N);
    range<2> work_group_size(M,M);

    //# Parallel Compute Matrix Multiplication
    h.parallel_for(nd_range<2>{global_size, work_group_size}, [=](nd_item<2> item){
      const int i = item.get_global_id(0);
      const int j = item.get_global_id(1);
      for (int k = 0; k < N; k++) {
        matrix_c[i*N+j] += matrix_a[i*N+k] * matrix_b[k*N+j];
      }
    });
  }).wait();  //# wait so the fill below does not race with the warm-up kernel

  //# Reset the output matrix before the timed run
  q.fill(matrix_c, 0.0f, N * N).wait();
  //# Measure the execution time of the offloaded kernel via a sycl::event
  std::cout << "\tOffload Device : " << q.get_device().get_info<info::device::name>() << "\n";
  std::cout << "\tmax_work_group_size : " << q.get_device().get_info<info::device::max_work_group_size>() << "\n";
  std::cout << "\tConfiguration : MATRIX_SIZE= " << N << "x" << N << "\n";
  std::cout << "Now the matrix-matrix multiplication." << "\n";

  //# Submit the command group to execute on the device; queue member functions return a sycl::event
  //# that can be queried for profiling timestamps once the kernel completes
  auto e = q.submit([&](handler &h){
    //# Define size for ND-range and work-group size
    range<2> global_size(N,N);
    range<2> work_group_size(M,M);

    //# Parallel Compute Matrix Multiplication
    h.parallel_for(nd_range<2>{global_size, work_group_size}, [=](nd_item<2> item){
      const int i = item.get_global_id(0);
      const int j = item.get_global_id(1);
      for (int k = 0; k < N; k++) {
        matrix_c[i*N+j] += matrix_a[i*N+k] * matrix_b[k*N+j];
      }
    });
  });
  q.wait();

  //# Compute the kernel execution time from the event's profiling timestamps (nanoseconds)
  auto kernel_duration = e.get_profiling_info<info::event_profiling::command_end>() -
                         e.get_profiling_info<info::event_profiling::command_start>();
  std::cout << "\tKernel Execution Time : " << kernel_duration / 1e+9 << " seconds" << "\n";
  auto duration = std::chrono::high_resolution_clock::now().time_since_epoch().count() - start;
  std::cout << "\tCompute Duration : " << duration / 1e+9 << " seconds\n";

  //# Print Output
  if (PRINT_OUTPUT_MATRIX){
    for (int i=0; i<N; i++){
      for (int j=0; j<N; j++){
        std::cout << matrix_c[i*N+j] << " ";
      }
      std::cout << "\n";
    }
  } else {
    std::cout << " [0][0] = " << matrix_c[0] << "\n";
  }
  //# Compute locally and compare with the offload computation
  if (VERIFY){
    int fail = 0;
    for (int i=0; i<N; i++){
      for (int j=0; j<N; j++){
        matrix_d[i*N+j] = 0.0f;
        for (int k=0; k<N; k++){
          matrix_d[i*N+j] += matrix_a[i*N+k] * matrix_b[k*N+j];
        }
        if (std::abs(matrix_c[i*N+j] - matrix_d[i*N+j]) / matrix_c[i*N+j] > 1.0e-5) {
          fail = 1;
          std::cout << i << " " << j << "\n";
        }
      }
    }
    if (fail == 1){
      std::cout << "FAIL\n";
    } else {
      std::cout << "PASS\n";
    }
  }

  //# Release the USM allocations
  sycl::free(matrix_a, q);
  sycl::free(matrix_b, q);
  sycl::free(matrix_c, q);
  sycl::free(matrix_d, q);
}