-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcuda_stream.cu
145 lines (128 loc) · 3.93 KB
/
cuda_stream.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include <stdio.h>
#include <stdlib.h>
#define M 32
#define D 3
#define N 32
void printMatrix(int* p, int row, int column)
{
for (int i = 0; i < row; i++)
{
printf("[ ");
for (int j = 0; j < column; j++)
{
if (j != column - 1)
{
printf("%d, ", *(p + i * column + j));
}
else
{
printf("%d ]\n", *(p + i * column + j));
}
}
}
}
__global__ void matmul_advance(int* p_a, int* p_b, int* p_c)
{
// 创建线程块内部共享内存缓冲区
__shared__ int buffer[16][16][D];
// 线程块坐标(blockIdx.x, blockIdx.y)
// 线程坐标(threadIdx.x, threadIdx.y, threadIdx.z)
int offset_row = blockIdx.x * blockDim.x + threadIdx.x;
int offset_column = blockIdx.y * blockDim.y + threadIdx.y;
int tmp1 = *(p_a + offset_row * D + threadIdx.z);
int tmp2 = *(p_b + offset_column + threadIdx.z * N);
buffer[threadIdx.x][threadIdx.y][threadIdx.z] = tmp1 * tmp2;
__syncthreads();
int i = D;
while (i != 1)
{
int half = i / 2;
int odd = i % 2;
if (threadIdx.z < half)
{
buffer[threadIdx.x][threadIdx.y][threadIdx.z] += buffer[threadIdx.x][threadIdx.y][threadIdx.z + half];
}
if (odd == 1 && threadIdx.z == 0)
{
buffer[threadIdx.x][threadIdx.y][0] += buffer[threadIdx.x][threadIdx.y][half];
}
__syncthreads();
i = half;
}
if (threadIdx.z == 0)
{
*(p_c + offset_row * N + offset_column) = buffer[threadIdx.x][threadIdx.y][0];
}
}
int main(void)
{
int* host_a;
int* host_b;
int* host_c;
int* dev_a;
int* dev_b;
int* dev_c;
// 主机上存储原始矩阵的内存分配为页锁定内存
cudaHostAlloc((void**)&host_a, M * D * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**)&host_b, D * N * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**)&host_c, M * N * sizeof(int), cudaHostAllocDefault);
cudaMalloc((void**)&dev_a, M * D * sizeof(int));
cudaMalloc((void**)&dev_b, D * N * sizeof(int));
cudaMalloc((void**)&dev_c, M * N * sizeof(int));
// 初始化两个输入矩阵
for (int i = 0; i < M; i++)
{
for (int j = 0; j < D; j++)
{
*(host_a + i * D + j) = 1;
}
}
for (int i = 0; i < D; i++)
{
for (int j = 0; j < N; j++)
{
*(host_b + i * N + j) = 1;
}
}
printf("Matrix A:\n");
printMatrix(host_a, M, D);
printf("Matrix B:\n");
printMatrix(host_b, D, N);
// 创建CUDA流
cudaStream_t stream;
cudaStreamCreate(&stream);
// 初始化并启动计时器
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
dim3 grid(M / 16, N / 16);
// D的最大值为4
dim3 block(16, 16, D);
// 将两个输入矩阵copy到GPU上(以异步的方式)
cudaMemcpyAsync(dev_a, host_a, M * D * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(dev_b, host_b, D * N * sizeof(int), cudaMemcpyHostToDevice, stream);
// 做矩阵乘法
matmul_advance<<<grid, block, 0, stream>>>(dev_a, dev_b, dev_c);
// 将计算结果拷贝回主机页锁定内存
cudaMemcpyAsync(host_c, dev_c, M * N *sizeof(int), cudaMemcpyDeviceToHost, stream);
// 计算运行时间
cudaStreamSynchronize(stream);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf( "Time to generate: %3.1f ms\n", elapsedTime );
printf("Matrix C:\n");
printMatrix(host_c, M, N);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
cudaFree(dev_c);
cudaFree(dev_b);
cudaFree(dev_a);
cudaFreeHost(host_a);
cudaFreeHost(host_b);
cudaFreeHost(host_c);
return 0;
}