-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduction_shared_memory.cu
85 lines (77 loc) · 2.52 KB
/
reduction_shared_memory.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <iostream>
#include <cstdlib>
// Total number of input elements to reduce. NOTE: not a multiple of
// THREADS_PER_BLOCK (50000 / 128 = 390.625), so the last block is partial
// and the grid over-covers the input by 48 elements.
const int Len = 50000;
// Threads per block; must be a power of two for the halving reduction loop.
const int THREADS_PER_BLOCK = 128;
// Block-level sum reduction using dynamic shared memory.
// Launch as <<<numBlocks, B, B * sizeof(float)>>> where B = blockDim.x is a
// power of two (the halving loop assumes this). Each block writes the sum of
// its tile of p_source to p_destination[blockIdx.x].
//
// n: number of valid elements in p_source. Defaults to Len so existing
// two-argument launches keep working. Threads whose global index is >= n
// contribute 0.0f, so the grid may safely over-cover the input — previously
// the tail block read past the end of p_source (Len is not a multiple of
// the block size), which is undefined behavior and corrupted the last
// partial sum.
__global__ void reduction(float* p_source, float* p_destination, int n = Len)
{
    // Dynamically-sized shared scratch; byte count comes from the 3rd launch
    // argument and must be at least blockDim.x * sizeof(float).
    extern __shared__ float s_d[];
    // Stage this thread's element from global into shared memory, with a
    // bounds guard for the partial tail block.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    s_d[threadIdx.x] = (idx < n) ? p_source[idx] : 0.0f;
    __syncthreads();
    // Tree reduction: halve the active range each iteration.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (threadIdx.x < offset)
        {
            s_d[threadIdx.x] += s_d[threadIdx.x + offset];
        }
        // Barrier is outside the divergent branch — every thread in the
        // block must reach it.
        __syncthreads();
    }
    // Thread 0 publishes the block's partial sum.
    if (threadIdx.x == 0)
    {
        p_destination[blockIdx.x] = s_d[0];
    }
}
// Fill p[0..n) with uniform random values in [0, 1] and print their
// host-side sum as the reference value ("Target Sum").
// n defaults to Len so existing callers are unchanged.
// NOTE(review): the accumulator is float (like the GPU's), so the printed
// reference carries the same single-precision rounding as the device sum.
void _init_array(float* p, int n = Len)
{
    float sum = 0;
    for (int i = 0; i < n; i++)
    {
        // 1.0 * rand() promotes to double before the divide, keeping
        // full precision in the quotient.
        float tmp = 1.0 * rand() / RAND_MAX;
        p[i] = tmp;
        sum += tmp;
    }
    std::cout << "Target Sum: " << sum << std::endl;
}
// Sum the per-block partial results produced by the reduction kernel and
// print the total. num_blocks defaults to the grid size used in main
// (ceil(Len / THREADS_PER_BLOCK)) so existing callers are unchanged.
void _get_gpu_result(float* p, int num_blocks = (Len + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK)
{
    float sum = 0;
    for (int i = 0; i < num_blocks; i++)
    {
        sum += p[i];
    }
    std::cout << "GPU Sum: " << sum << std::endl;
}
// Abort with a readable message if a CUDA runtime call failed. Every CUDA
// API call returns cudaError_t; ignoring it hides sticky errors that make
// later calls fail mysteriously.
static void cuda_check(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Host driver: fill a random array, reduce it per-block on the GPU, sum the
// partial results on the host, and report the kernel's elapsed time.
int main(void)
{
    // Grid size: ceil(Len / THREADS_PER_BLOCK); the last block is partial.
    const int num_blocks = (Len + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    cudaEvent_t start, end;
    cuda_check(cudaEventCreate(&start), "cudaEventCreate(start)");
    cuda_check(cudaEventCreate(&end), "cudaEventCreate(end)");
    float* h_src = (float*)malloc(Len * sizeof(float));
    float* h_des = (float*)malloc(num_blocks * sizeof(float));
    _init_array(h_src);
    float* d_src;
    float* d_des;
    cuda_check(cudaMalloc((void**)&d_src, Len * sizeof(float)), "cudaMalloc(d_src)");
    cuda_check(cudaMalloc((void**)&d_des, num_blocks * sizeof(float)), "cudaMalloc(d_des)");
    cuda_check(cudaMemcpy(d_src, h_src, Len * sizeof(float), cudaMemcpyHostToDevice), "H2D copy");
    cuda_check(cudaEventRecord(start), "cudaEventRecord(start)");
    // Third launch argument: dynamic shared-memory bytes per block, matching
    // the kernel's `extern __shared__ float s_d[]`.
    reduction<<<num_blocks, THREADS_PER_BLOCK, THREADS_PER_BLOCK * sizeof(float)>>>(d_src, d_des);
    // Kernel launches don't return errors directly — query them explicitly.
    cuda_check(cudaGetLastError(), "kernel launch");
    cuda_check(cudaEventRecord(end), "cudaEventRecord(end)");
    cuda_check(cudaEventSynchronize(end), "cudaEventSynchronize(end)");
    // Blocking copy: also guarantees the kernel has finished before we read.
    cuda_check(cudaMemcpy(h_des, d_des, num_blocks * sizeof(float), cudaMemcpyDeviceToHost), "D2H copy");
    _get_gpu_result(h_des);
    // Read the timing before destroying the events.
    float elapsed_time;
    cuda_check(cudaEventElapsedTime(&elapsed_time, start, end), "cudaEventElapsedTime");
    std::cout << "Elapsed Time: " << elapsed_time << " ms" << std::endl;
    cuda_check(cudaFree(d_des), "cudaFree(d_des)");
    cuda_check(cudaFree(d_src), "cudaFree(d_src)");
    free(h_des);
    free(h_src);
    cuda_check(cudaEventDestroy(end), "cudaEventDestroy(end)");
    cuda_check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    return 0;
}