2d-convolution-tiling.cu
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <stdexcept>
#include <string>

#define BLOCK_WIDTH 32

using namespace std;
/**
 * 2D convolution kernel with tiling.
 * It optimizes the convolution operation by using shared memory to store an
 * input tile, reducing the number of global memory accesses.
 *
 * Each block first loads a BLOCK_WIDTH x BLOCK_WIDTH tile of the input matrix
 * (the output elements plus their halo) into shared memory, then performs the
 * convolution within the tile.
 *
 * @param cuda_input_m Input matrix.
 * @param mask Mask matrix.
 * @param cuda_output_m Output matrix.
 * @param height Height of the input matrix.
 * @param width Width of the input matrix.
 * @param mask_width Width of the square mask.
 * @param N_TILE_WIDTH Width of the output tile computed per block, i.e. BLOCK_WIDTH - (mask_width - 1).
 */
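// Illustrative geometry (an example, not enforced here): with BLOCK_WIDTH = 32
// and mask_width = 5, N_TILE_WIDTH = 32 - (5 - 1) = 28, so each 32x32 thread
// block produces a 28x28 output tile; the remaining threads only help load the
// 2-element halo on each side.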
__global__ void convolution_2D_tiled_kernel(
    const float* cuda_input_m,
    const float* __restrict__ mask,
    float* cuda_output_m,
    const size_t height,
    const size_t width,
    const size_t mask_width,
    const int N_TILE_WIDTH
) {
    // shared memory for the input tile (output tile plus halo)
    __shared__ float tile_shared_memory[BLOCK_WIDTH][BLOCK_WIDTH];
    // init
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int n_row = blockIdx.y * N_TILE_WIDTH + ty;
    const int n_col = blockIdx.x * N_TILE_WIDTH + tx;
    // cast before subtracting so the arithmetic stays signed (n_row may be
    // smaller than the halo offset near the top/left edges)
    const int m_row = n_row - static_cast<int>(mask_width) / 2;
    const int m_col = n_col - static_cast<int>(mask_width) / 2;
    // boundary condition
    if (m_row >= 0 && m_row < height && m_col >= 0 && m_col < width) {
        // load element from input matrix to shared memory in the respective tile position
        tile_shared_memory[ty][tx] = cuda_input_m[m_row * width + m_col];
    } else {
        // ghost cells outside the input are treated as zero
        tile_shared_memory[ty][tx] = 0;
    }
    // barrier synchronization: the whole tile must be loaded before any thread reads it
    __syncthreads();
    // boundary condition to avoid out-of-bounds access, because each block computes only N_TILE_WIDTH x N_TILE_WIDTH elements
    if (ty < N_TILE_WIDTH && tx < N_TILE_WIDTH && n_row < height && n_col < width) {
        // convolution result
        float convolution_result = 0;
        // calculate convolution result
        for (int i = 0; i < mask_width; ++i) {
            for (int j = 0; j < mask_width; ++j) {
                convolution_result += mask[i * mask_width + j] * tile_shared_memory[ty + i][tx + j];
            }
        }
        // save convolution result to output matrix (barrier synchronization not needed)
        cuda_output_m[n_row * width + n_col] = convolution_result;
    }
}
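// A minimal error-checking helper (a sketch added for illustration; the call
// sites below are left unchecked, as in the original). Wrapping runtime calls
// like this is a common way to surface cudaMalloc/cudaMemcpy failures early.
#define CUDA_CHECK(call)                                                       \
    do {                                                                       \
        const cudaError_t cuda_check_err = (call);                             \
        if (cuda_check_err != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                      \
                    cudaGetErrorString(cuda_check_err), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)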
/**
 * Print a matrix.
 * @param matrix Matrix to print.
 * @param height Height of the matrix.
 * @param width Width of the matrix.
 * @throw invalid_argument If matrix is null.
 */
void print_matrix(const float* matrix, const int height, const int width) {
    if (matrix == nullptr) {
        throw invalid_argument("Matrix cannot be null");
    }
    for (int i = 0; i < height * width; ++i) {
        printf("%f ", matrix[i]);
        if (i % width == width - 1) {
            printf("\n");
        }
    }
}
/**
 * Create a matrix with random values between lower_bound and upper_bound.
 * @param result Result matrix.
 * @param rows Number of rows of the output matrix.
 * @param cols Number of cols of the output matrix.
 * @param lower_bound Lower limit for random number generation.
 * @param upper_bound Upper limit for random number generation.
 * @throw invalid_argument If result is null or lower_bound > upper_bound.
 */
void create_random_matrix(float *result, const int rows, const int cols, const int lower_bound, const int upper_bound) {
    if (lower_bound > upper_bound) {
        throw invalid_argument("Lower bound cannot be greater than upper bound");
    }
    if (result == nullptr) {
        throw invalid_argument("Result matrix cannot be null");
    }
    // init bound
    const int boundary = rows * cols;
    // insert values in [lower_bound, upper_bound]
    for (int i = 0; i < boundary; ++i)
        result[i] = static_cast<float>(random() % (upper_bound - lower_bound + 1) + lower_bound);
}
/**
 * Create a constant matrix filled with a specific value.
 * @param result Constant matrix result.
 * @param rows Number of rows of the output matrix.
 * @param cols Number of cols of the output matrix.
 * @param value Value to fill the matrix with.
 * @throw invalid_argument If result is null.
 */
void create_constant_matrix(float *result, const int rows, const int cols, const float value) {
    if (result == nullptr)
        throw invalid_argument("Result matrix cannot be null");
    // init bound
    const int boundary = rows * cols;
    // insert values
    for (int i = 0; i < boundary; ++i)
        result[i] = value;
}
/**
 * Verify the result of the convolution operation using the CPU.
 * @param matrix Input matrix.
 * @param mask Mask matrix.
 * @param result Result matrix to verify.
 * @param height Height of the input matrix.
 * @param width Width of the input matrix.
 * @param mask_dim Dimension of the mask matrix.
 * @throw invalid_argument If matrix, mask, or result is null.
 */
void verify_result(
    const float *matrix,
    const float *mask,
    const float *result,
    const int height,
    const int width,
    const int mask_dim
) {
    if (matrix == nullptr || mask == nullptr || result == nullptr) {
        throw invalid_argument("Matrix, mask, and result cannot be null");
    }
    const int mask_offset = mask_dim / 2;
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            float expected_convolution = 0.0f;
            for (int k = 0; k < mask_dim; k++) {
                for (int l = 0; l < mask_dim; l++) {
                    const int r = i - mask_offset + k;
                    if (const int c = j - mask_offset + l; r >= 0 && r < height && c >= 0 && c < width) {
                        expected_convolution += matrix[r * width + c] * mask[k * mask_dim + l];
                    }
                }
            }
            const float convolution = result[i * width + j];
            // compare with a small tolerance: GPU and CPU floating-point
            // accumulation (e.g. FMA contraction) need not be bit-identical
            assert(fabsf(convolution - expected_convolution) <= 1e-3f * fabsf(expected_convolution) + 1e-3f);
        }
    }
}
int main() {
    // init
    int mask_width = 0, matrix_width = 0, matrix_height = 0;
    int seed = 0, warmup = 0;
    // try to get the env variables for the matrix size and mask size
    try {
        const char* mask_size_env = getenv("MASK_SIZE");
        const char* matrix_width_env = getenv("MATRIX_WIDTH");
        const char* matrix_height_env = getenv("MATRIX_HEIGHT");
        // getenv returns nullptr for a missing variable; stoi(nullptr) is undefined behavior
        if (mask_size_env == nullptr || matrix_width_env == nullptr || matrix_height_env == nullptr) {
            throw invalid_argument("Missing env variable");
        }
        mask_width = stoi(mask_size_env);
        matrix_width = stoi(matrix_width_env);
        matrix_height = stoi(matrix_height_env);
        if (!(mask_width > 0 && matrix_width > 0 && matrix_height > 0 && mask_width % 2 != 0)) {
            throw invalid_argument("Invalid argument");
        }
    } catch (...) {
        printf("Error reading env variables MASK_SIZE, MATRIX_HEIGHT, MATRIX_WIDTH: "
               "each must be a positive integer, and the mask dimension must be odd.\n");
        return 1;
    }
    // try to get the env variable for the seed
    try {
        const char* seed_env = getenv("SEED");
        if (seed_env == nullptr) {
            throw invalid_argument("SEED not set");
        }
        seed = stoi(seed_env);
    } catch (...) {
        printf("WARNING: SEED env variable not found; the random generator will not be seeded.\n\n");
    }
    // try to get the env variable for the warmup
    try {
        const char* warmup_env = getenv("WARMUP");
        if (warmup_env == nullptr) {
            throw invalid_argument("WARMUP not set");
        }
        warmup = stoi(warmup_env);
    } catch (...) {
        printf("WARNING: WARMUP env variable not set; only the single timed run will be performed.\n\n");
    }
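    // Example invocation (illustrative values, not defaults):
    //   MASK_SIZE=5 MATRIX_WIDTH=1024 MATRIX_HEIGHT=1024 SEED=42 WARMUP=3 ./2d-convolution-tiling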
    // retrieve some info about the CUDA device
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);
    printf("Device Number: %d\n", 0);
    printf(" Device name: %s\n", prop.name);
    printf(" max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
    printf(" max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
    printf(" max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
    printf(" num SM: %d\n", prop.multiProcessorCount);
    printf(" num bytes sharedMem Per Block: %lu\n", prop.sharedMemPerBlock);
    printf(" num bytes sharedMem Per Multiprocessor: %lu\n", prop.sharedMemPerMultiprocessor);
    printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
    printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
    printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
           2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    /** Execution **/
    // init
    float gpu_elapsed_time_ms = 0.0f;
    // each block computes an N_TILE_WIDTH x N_TILE_WIDTH output tile
    const int N_TILE_WIDTH = BLOCK_WIDTH - (mask_width - 1);
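    // Sanity check (added as a sketch): the tile width above is only usable
    // when mask_width <= BLOCK_WIDTH; e.g. BLOCK_WIDTH = 32 supports odd masks
    // up to 31x31, which leaves N_TILE_WIDTH = 2.
    assert(N_TILE_WIDTH > 0);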
    float* input_m = static_cast<float *>(malloc(matrix_height * matrix_width * sizeof(float)));
    float* mask = static_cast<float *>(malloc(mask_width * mask_width * sizeof(float)));
    float* output_m = static_cast<float *>(malloc(matrix_width * matrix_height * sizeof(float)));
    // populate
    if (seed != 0) {
        srandom(seed); // seed the generator used by random() (srand only seeds rand)
    }
    create_random_matrix(input_m, matrix_height, matrix_width, 1, 100);
    create_random_matrix(mask, mask_width, mask_width, 1, 5);
    // =============================================== START CONVOLUTION ===============================================
    // time event
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // define grid and block dimensions
    dim3 dim_grid(
        ceil(matrix_width / static_cast<float>(N_TILE_WIDTH)),
        ceil(matrix_height / static_cast<float>(N_TILE_WIDTH))
    );
    dim3 dim_block(BLOCK_WIDTH, BLOCK_WIDTH);
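    // Worked example (illustrative): for a 1024x1024 input, a 5x5 mask and
    // BLOCK_WIDTH = 32, N_TILE_WIDTH = 28, so dim_grid = ceil(1024 / 28) = 37
    // blocks per dimension, each launching 32x32 threads.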
    const size_t bytes_input_m = matrix_height * matrix_width * sizeof(float);
    const size_t bytes_mask = mask_width * mask_width * sizeof(float);
    // allocate memory on the device
    float* cuda_input_m;
    float* cuda_mask;
    float* cuda_output_m;
    cudaMalloc(reinterpret_cast<void **>(&cuda_input_m), bytes_input_m);
    cudaMalloc(reinterpret_cast<void **>(&cuda_mask), bytes_mask);
    cudaMalloc(reinterpret_cast<void **>(&cuda_output_m), bytes_input_m); // same bytes as input
    // initialize memory on the device
    cudaMemcpy(cuda_input_m, input_m, bytes_input_m, cudaMemcpyHostToDevice);
    cudaMemcpy(cuda_mask, mask, bytes_mask, cudaMemcpyHostToDevice);
    // warmup
    for (int i = 0; i < warmup; ++i) {
        convolution_2D_tiled_kernel<<<dim_grid, dim_block>>>(
            cuda_input_m, cuda_mask, cuda_output_m, matrix_height, matrix_width, mask_width, N_TILE_WIDTH
        );
    }
    // timed run: record both events on the stream, then wait on the stop event
    cudaEventRecord(start, nullptr);
    convolution_2D_tiled_kernel<<<dim_grid, dim_block>>>(
        cuda_input_m, cuda_mask, cuda_output_m, matrix_height, matrix_width, mask_width, N_TILE_WIDTH
    );
    cudaEventRecord(stop, nullptr);
    cudaEventSynchronize(stop);
    cudaMemcpy(output_m, cuda_output_m, bytes_input_m, cudaMemcpyDeviceToHost);
    cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);
    printf("Check the result of the convolution operation using the CPU...\n");
    verify_result(input_m, mask, output_m, matrix_height, matrix_width, mask_width);
    printf("Verification passed!\n");
    // debug: print the output matrix
    // print_matrix(output_m, matrix_height, matrix_width);
    printf("Time elapsed on tiled GPU 2D-convolution of %dx%d (block %d): %f ms.\n\n",
           matrix_height, matrix_width, BLOCK_WIDTH, gpu_elapsed_time_ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(cuda_input_m);
    cudaFree(cuda_mask);
    cudaFree(cuda_output_m);
    // ================================================ END CONVOLUTION ================================================
    // free host memory
    free(input_m);
    free(mask);
    free(output_m);
    return 0;
}