-
Notifications
You must be signed in to change notification settings - Fork 35
/
mp8_implement.cu
155 lines (129 loc) · 6.34 KB
/
mp8_implement.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#include <wb.h>
#include <iostream>
#define SEGMENT_LENGTH 256
#define BLOCK_SIZE 256
// Element-wise vector addition: out[i] = in1[i] + in2[i] for every i in [0, len).
// Expects a 1-D grid of 1-D blocks; threads whose flat index falls past the
// end of the segment simply return (grids rarely divide data evenly).
__global__ void vecAdd(float * in1, float * in2, float * out, int len) {
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= len) {
        return;  // tail guard: this thread has no element to process
    }
    out[i] = in1[i] + in2[i];
}
// Host-side helper: returns the smaller of two integers.
int myMin(int a, int b){
    return (a < b) ? a : b;
}
// Host-side helper: integer ceiling division of a by b (used to size the grid
// so that every element gets a thread). Intended for non-negative a, positive b.
int ceil(int a, int b){
    const int biased = a + b - 1;  // push partial chunks up to the next multiple
    return biased / b;
}
// Segmented, stream-overlapped vector addition (breadth-first issue order):
// the input is processed 4 segments at a time, each segment bound to its own
// stream so H2D copy, kernel, and D2H copy of different segments can overlap.
// Because every device-buffer slot is always used by the same stream,
// per-stream ordering makes reuse of the slots across outer-loop iterations
// safe without extra synchronization.
int main(int argc, char ** argv) {
    wbArg_t args;
    int inputLength;
    float * hostInput1;
    float * hostInput2;
    float * deviceInput1;
    float * deviceInput2;
    float * deviceOutput;
    float * pinnedHostInput1;
    float * pinnedHostInput2;
    float * pinnedHostOutput;
    cudaStream_t stream0, stream1, stream2, stream3;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    args = wbArg_read(argc, argv);
    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
    hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
    wbTime_stop(Generic, "Importing data and creating memory on host");
    // Page-locked (pinned) host buffers are required for cudaMemcpyAsync to
    // actually run asynchronously and overlap with kernels.
    cudaMallocHost((void **)&pinnedHostInput1, sizeof(float) * inputLength);
    cudaMallocHost((void **)&pinnedHostInput2, sizeof(float) * inputLength);
    cudaMallocHost((void **)&pinnedHostOutput, sizeof(float) * inputLength);
    // Stage the pageable wbImport data into the pinned buffers.
    memcpy(pinnedHostInput1, hostInput1, sizeof(float) * inputLength);
    memcpy(pinnedHostInput2, hostInput2, sizeof(float) * inputLength);
    // 1. Allocate device memory: only 4 segment slots per array, reused each
    // outer-loop iteration, rather than the whole input at once.
    cudaMalloc((void**) &deviceInput1, sizeof(float) * 4 * SEGMENT_LENGTH);
    cudaMalloc((void**) &deviceInput2, sizeof(float) * 4 * SEGMENT_LENGTH);
    cudaMalloc((void**) &deviceOutput, sizeof(float) * 4 * SEGMENT_LENGTH);
    dim3 DimGrid(ceil(SEGMENT_LENGTH, BLOCK_SIZE), 1, 1);
    dim3 DimBlock(BLOCK_SIZE, 1, 1);
    // 2. Do computation, breadth-first kernel issue: queue all H2D copies,
    // then all kernels, then all D2H copies, so the hardware copy/compute
    // queues interleave work from the four streams.
    for(int index = 0; index < inputLength; index += 4 * SEGMENT_LENGTH){
        int currentPtr1 = index;
        int currentPtr2 = currentPtr1 + SEGMENT_LENGTH;
        int currentPtr3 = currentPtr2 + SEGMENT_LENGTH;
        int currentPtr4 = currentPtr3 + SEGMENT_LENGTH;
        int length1 = 0, length2 = 0, length3 = 0, length4 = 0;
        // Copy this round's (up to) four segments host -> device; the last
        // segment of the input may be shorter than SEGMENT_LENGTH.
        if(currentPtr1 < inputLength){
            length1 = myMin(SEGMENT_LENGTH, inputLength - currentPtr1);
            cudaMemcpyAsync(&deviceInput1[0], &pinnedHostInput1[currentPtr1], sizeof(float) * length1, cudaMemcpyHostToDevice, stream0);
            cudaMemcpyAsync(&deviceInput2[0], &pinnedHostInput2[currentPtr1], sizeof(float) * length1, cudaMemcpyHostToDevice, stream0);
        }
        if(currentPtr2 < inputLength){
            length2 = myMin(SEGMENT_LENGTH, inputLength - currentPtr2);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH], &pinnedHostInput1[currentPtr2], sizeof(float) * length2, cudaMemcpyHostToDevice, stream1);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH], &pinnedHostInput2[currentPtr2], sizeof(float) * length2, cudaMemcpyHostToDevice, stream1);
        }
        if(currentPtr3 < inputLength){
            length3 = myMin(SEGMENT_LENGTH, inputLength - currentPtr3);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH * 2], &pinnedHostInput1[currentPtr3], sizeof(float) * length3, cudaMemcpyHostToDevice, stream2);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH * 2], &pinnedHostInput2[currentPtr3], sizeof(float) * length3, cudaMemcpyHostToDevice, stream2);
        }
        if(currentPtr4 < inputLength){
            length4 = myMin(SEGMENT_LENGTH, inputLength - currentPtr4);
            cudaMemcpyAsync(&deviceInput1[SEGMENT_LENGTH * 3], &pinnedHostInput1[currentPtr4], sizeof(float) * length4, cudaMemcpyHostToDevice, stream3);
            cudaMemcpyAsync(&deviceInput2[SEGMENT_LENGTH * 3], &pinnedHostInput2[currentPtr4], sizeof(float) * length4, cudaMemcpyHostToDevice, stream3);
        }
        // Launch one kernel per live segment, each in its segment's stream.
        if(currentPtr1 < inputLength){
            vecAdd<<<DimGrid, DimBlock, 0, stream0>>>(&deviceInput1[0], &deviceInput2[0], &deviceOutput[0], length1);
        }
        if(currentPtr2 < inputLength){
            vecAdd<<<DimGrid, DimBlock, 0, stream1>>>(&deviceInput1[SEGMENT_LENGTH], &deviceInput2[SEGMENT_LENGTH], &deviceOutput[SEGMENT_LENGTH], length2);
        }
        if(currentPtr3 < inputLength){
            vecAdd<<<DimGrid, DimBlock, 0, stream2>>>(&deviceInput1[SEGMENT_LENGTH * 2], &deviceInput2[SEGMENT_LENGTH * 2], &deviceOutput[SEGMENT_LENGTH * 2], length3);
        }
        if(currentPtr4 < inputLength){
            vecAdd<<<DimGrid, DimBlock, 0, stream3>>>(&deviceInput1[SEGMENT_LENGTH * 3], &deviceInput2[SEGMENT_LENGTH * 3], &deviceOutput[SEGMENT_LENGTH * 3], length4);
        }
        // Copy results device -> host, again per stream.
        if(currentPtr1 < inputLength){
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr1], &deviceOutput[0], sizeof(float) * length1, cudaMemcpyDeviceToHost, stream0);
        }
        if(currentPtr2 < inputLength){
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr2], &deviceOutput[SEGMENT_LENGTH], sizeof(float) * length2, cudaMemcpyDeviceToHost, stream1);
        }
        if(currentPtr3 < inputLength){
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr3], &deviceOutput[SEGMENT_LENGTH * 2], sizeof(float) * length3, cudaMemcpyDeviceToHost, stream2);
        }
        if(currentPtr4 < inputLength){
            cudaMemcpyAsync(&pinnedHostOutput[currentPtr4], &deviceOutput[SEGMENT_LENGTH * 3], sizeof(float) * length4, cudaMemcpyDeviceToHost, stream3);
        }
    }
    // Wait for all queued async work before reading pinnedHostOutput on the
    // host; also surface any async error that occurred in the loop above,
    // since none of the individual calls were checked.
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }
    std::cout<<"check hostoutput"<<std::endl;
    for(int index = 0; index < myMin(10, inputLength); index++){
        std::cout<<pinnedHostOutput[index]<<", ";
    }
    std::cout<<std::endl;
    wbSolution(args, pinnedHostOutput, inputLength);
    // Destroy streams (previously leaked).
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);
    // Free GPU memory.
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    // Free page-locked host memory.
    cudaFreeHost(pinnedHostInput1);
    cudaFreeHost(pinnedHostInput2);
    cudaFreeHost(pinnedHostOutput);
    // Free pageable memory returned by wbImport.
    free(hostInput1);
    free(hostInput2);
    return 0;
}