#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cstring>
#include <ctime>
#define WARP_SIZE 32 // Number of threads executed in lockstep (32 on all current NVIDIA GPUs)
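// (The actual warp size can also be queried at runtime via cudaDeviceProp::warpSize)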
/* Simple CUDA example showing:
1) how to sum the values of an array in parallel
2) how to add a scalar to every value of an array in parallel
3) how to query GPU hardware
Compile with a minimum architecture specification of sm_30. Example:
nvcc example.cu -o example -arch=sm_30
Author: Jordan Bonilla
*/
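// Note: on CUDA 9 and newer, the "__shfl_xor" intrinsic used below is
// deprecated in favor of "__shfl_xor_sync(0xffffffff, ...)"; the older form is
// kept here to match the sm_30-era toolchains this example targets.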
// Allow coarse timing of functions (values are in clock ticks; divide by
// CLOCKS_PER_SEC for seconds)
clock_t start, end;
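// The CUDA API calls in this file are left unchecked for brevity. A minimal
// error-checking macro is sketched here as one common pattern (the name
// CUDA_CHECK is ours, not part of the CUDA API); it is defined but not used
// below so that the original flow is unchanged:
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error %s at %s:%d\n", \
                cudaGetErrorString(err_), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Example usage: CUDA_CHECK(cudaMalloc((void**) &d_a, N * sizeof(int)));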
/* Add "scalar" to every element of the input array in parallel */
__global__ void
_cuda_add_scalar(int *in, int scalar, int n)
{
int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
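// Grid-stride loop: each thread processes elements globalIdx,
// globalIdx + blockDim.x * gridDim.x, ... so any array size is covered
// by a fixed number of blocks and threads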
while(globalIdx < n)
{
in[globalIdx] = in[globalIdx] + scalar;
globalIdx += blockDim.x * gridDim.x;
}
}
// CPU entry point for kernel to add "scalar" to every element of the input array
void cuda_add_scalar(int * a, int scalar, int N) {
// Get device properties to compute optimal launch bounds
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int num_SMs = prop.multiProcessorCount;
int * d_a;
start = std::clock();
// Copy array "a" to the GPU
cudaMalloc( (void**) &d_a, N * sizeof(int) );
cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
_cuda_add_scalar<<< num_SMs, 1024 >>>(d_a, scalar, N);
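// Kernel launches are asynchronous, but the device-to-host copy below blocks
// until the kernel finishes, so "end" includes the kernel execution time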
// Read in results
cudaMemcpy( a, d_a, N * sizeof(int), cudaMemcpyDeviceToHost );
end = std::clock();
// Verify: the input was initialized to 0..N-1, so after adding a scalar
// consecutive elements should still differ by exactly 1
for(int i = 0; i < N-1; ++i) {
if(a[i] + 1 != a[i+1]){
printf("Verification failed: a[%d]: %d, a[%d]: %d\n", i, a[i], i+1, a[i+1]);
exit(-1);
}
}
// Clean up
cudaFree(d_a);
}
/* Use all GPU Streaming Multiprocessors to add elements in parallel.
Requires that the number of elements is a multiple of #SMs * 1024
since the algorithm processes elements in chunks of this size.
This is taken care of in "cuda_parallel_sum", which pads the array with zeros. */
__global__ void
_cuda_parallel_sum(int *in, int num_elements, int *sum)
{
// Holds per-warp partial sums for the shared-memory reduction:
// a 1024-thread block has 1024 / 32 = 32 = WARP_SIZE warps, one slot each
__shared__ int buffer[WARP_SIZE];
int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
int lane = threadIdx.x % WARP_SIZE;
int temp;
while(globalIdx < num_elements)
{
// All threads in a block of 1024 take an element
temp = in[globalIdx];
// All warps in this block (32) compute the sum of all
// threads in their warp
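// (__shfl_xor reads "temp" from the lane whose ID differs by "delta",
// a butterfly pattern that leaves every lane holding the warp-wide sum)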
for(int delta = WARP_SIZE/2; delta > 0; delta /= 2)
{
temp += __shfl_xor(temp, delta);
}
// Write all 32 of these partial sums to shared memory
if(lane == 0)
{
buffer[threadIdx.x / WARP_SIZE] = temp;
}
__syncthreads();
// Add the remaining 32 partial sums using a single warp
if(threadIdx.x < WARP_SIZE)
{
temp = buffer[threadIdx.x];
for(int delta = WARP_SIZE / 2; delta > 0; delta /= 2)
{
temp += __shfl_xor(temp, delta);
}
}
// Add this block's sum to the total sum
if(threadIdx.x == 0)
{
atomicAdd(sum, temp);
}
// Jump ahead 1024 * #SMs to the next region of numbers to sum
globalIdx += blockDim.x * gridDim.x;
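// Ensure the first warp is done reading "buffer" before the next
// iteration's writes to it begin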
__syncthreads();
}
}
/* CPU entry function that allocates intermediates, pads the array,
and calls the CUDA kernel to sum all elements of the array.
*/
int cuda_parallel_sum(int * a, int N) {
// Get device properties to compute optimal launch bounds
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int num_SMs = prop.multiProcessorCount;
start = std::clock();
// pad array with zeros to allow sum algorithm to work
int batch_size = num_SMs * 1024;
int padding = (batch_size - (N % batch_size)) % batch_size;
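// e.g. batch_size = 2048, N = 3000: padding = (2048 - 3000 % 2048) % 2048 = 1096,
// so 4096 elements total; the outer "%" makes padding 0 when N is already a multiple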
// b is the new padded array
int * b = new int[N + padding];
memcpy(b, a, N * sizeof(int));
memset(b + N, 0, padding * sizeof(int));
// Copy array "b" to GPU
int *d_b;
cudaMalloc( (void**) &d_b, (N + padding) * sizeof(int) );
cudaMemcpy( d_b, b, (N + padding) * sizeof(int), cudaMemcpyHostToDevice );
// Result
int result = 0;
int * d_result;
cudaMalloc( (void**) &d_result, sizeof(int) );
cudaMemcpy( d_result, &result, sizeof(int), cudaMemcpyHostToDevice );
// Call kernel to get sum
_cuda_parallel_sum<<< num_SMs, 1024 >>>(d_b, N + padding, d_result);
// Read in results
cudaMemcpy( &result, d_result, sizeof(int), cudaMemcpyDeviceToHost );
end = std::clock();
// Clean up
cudaFree(d_result);
cudaFree(d_b);
delete[] b;
return result;
}
/* Print pertinent info about all connected GPUs and return the device count through "nDevices" */
void query_GPUs(int * nDevices) {
cudaGetDeviceCount(nDevices);
for (int i = 0; i < *nDevices; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device Number: %d\n", i);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Number of SMs: %d\n", prop.multiProcessorCount);
printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}
}
// Prints the first "size" elements of an array on one line
void print_array(int * in, int size) {
for(int i = 0; i < size; ++i) {
printf("%d ", in[i]);
}
printf("\n");
}
int cpu_sum(int * a, int N) {
int sum = 0;
start = std::clock();
for(int i = 0; i < N; ++i) {
sum += a[i];
}
end = std::clock();
return sum;
}
void cpu_add(int * a, int scalar, int N) {
start = std::clock();
// Add "scalar" to every element of a
for(int i = 0; i < N; ++i) {
a[i] += scalar;
}
end = std::clock();
}
int main(int argc, char ** argv) {
if(argc != 2) {
printf("Usage: ./example <size of array>\n");
exit(-1);
}
const int N = atoi(argv[1]);
printf("specified N: %d\n\n", N);
int nDevices;
// Query GPUs
query_GPUs(&nDevices);
// Select first GPU
if(nDevices != 0) {
cudaSetDevice(0);
}
else {
printf("No CUDA devices found!\n");
exit(-1);
}
// Declare CPU array
int * a = new int[N];
for(int i = 0; i < N; i++) {
a[i] = i;
}
/* Perform GPU operations
___________________________ */
printf("\n\n Computations using GPU\n\n");
// If GPU Boost is enabled, the GPU might be in "idle" mode
// so we run a kernel to get the clock speed to baseline levels.
int warmup_the_gpu_sum = cuda_parallel_sum(a, N);
// Compute and print the sum on the GPU, along with the elapsed clock ticks
int gpu_sum1 = cuda_parallel_sum(a, N);
printf("sum: %d - total time: %ld\n", gpu_sum1, (long)(end - start));
// Add one to every element of a
cuda_add_scalar(a, 1, N);
printf("Total time to add 1 to every element: %ld\n", (long)(end - start));
// Print first 10 ints in resulting array
print_array(a, 10);
// Print new sum
int gpu_sum2 = cuda_parallel_sum(a, N);
printf("sum: %d - total time: %ld\n", gpu_sum2, (long)(end - start));
/* Perform same operations on CPU
___________________________ */
// Reset vector a
for(int i = 0; i < N; ++i) {
a[i] = i;
}
printf("\n\n Same Computations using CPU\n\n");
int cpu_sum1 = cpu_sum(a, N);
printf("sum: %d - total time: %ld\n", cpu_sum1, (long)(end - start));
cpu_add(a, 1, N);
printf("Total time to add 1 to every element: %ld\n", (long)(end - start));
// Print first 10 ints in resulting array
print_array(a, 10);
// Print new sum
int cpu_sum2 = cpu_sum(a, N);
printf("sum: %d - total time: %ld\n", cpu_sum2, (long)(end - start));
printf("\n\nGPU consistency test: %d\n ... %s\n", warmup_the_gpu_sum - gpu_sum1,
(warmup_the_gpu_sum - gpu_sum1 == 0.0 ? "SUCCESS" : "FAIL"));
printf("Difference in first sums: %d\n ... %s\n", gpu_sum1 - cpu_sum1,
(gpu_sum1 - cpu_sum1 == 0 ? "SUCCESS" : "FAIL"));
printf("Difference in second sums: %d\n ... %s\n", gpu_sum2 - cpu_sum2,
(gpu_sum2 - cpu_sum2 == 0 ? "SUCCESS" : "FAIL"));
// Clean up
delete[] a;
return 0;
}