-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathGpuUtils.cu
253 lines (223 loc) · 8.01 KB
/
GpuUtils.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
/* Shared Use License: This file is owned by Derivative Inc. (Derivative)
* and can only be used, and/or modified for use, in conjunction with
* Derivative's TouchDesigner software, and only if you are a licensee who has
* accepted Derivative's TouchDesigner license or assignment agreement
* (which also govern the use of this file). You may share or redistribute
* a modified version of this file provided the following conditions are met:
*
* 1. The shared file or redistribution must retain the information set out
* above and this list of conditions.
* 2. Derivative's name (Derivative Inc.) or its trademarks may not be used
* to endorse or promote products derived from this file without specific
* prior written permission from Derivative.
*/
#include "GpuUtils.cuh"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include <opencv2/core/cuda.hpp>
namespace
{
// Creates a surface object that refers to the CUDA array `arr` and stores the
// handle in `*sObj`.
// The status returned by cudaCreateSurfaceObject is discarded, so a failed
// creation is not reported to the caller.
void
createSurfaceObj(cudaSurfaceObject_t* sObj, cudaArray* arr)
{
    // Zero-initialise the descriptor, then mark it as array-backed.
    cudaResourceDesc desc{};
    desc.resType = cudaResourceTypeArray;
    desc.res.array.array = arr;

    cudaCreateSurfaceObject(sObj, &desc);
}
// Copies a surface into a pitched byte buffer one byte per thread, flipping
// the image vertically (surface row y lands in destination row height-1-y).
//   src     - surface to read from
//   dst     - first byte of the destination buffer
//   dstStep - distance in bytes between consecutive destination rows
//   width   - number of bytes to copy per row
//   height  - number of rows
// Expects a 2D launch in which every (x, y) inside width x height is covered
// by a thread; threads outside that range do nothing.
__global__ void
copySurfaceToMat(cudaSurfaceObject_t src,
                 uchar* dst, size_t dstStep,
                 int width, int height)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so discard the overhang.
    if (col >= width || row >= height)
        return;

    // Fetch one byte from the surface (x is a byte offset).
    uchar value;
    surf2Dread(&value, src, col, row);

    // Store it in the vertically mirrored row of the destination.
    dst[(height - row - 1) * dstStep + col] = value;
}
// Source channels are 32-bit float; destination pixels are float2
// (i.e. a 32FC2 mat).
// Reads one float channel per pixel from the surface and writes it as the
// first component of the destination pixel; the second component is set to 0.
// Rows are flipped vertically.
//   dstStep   - distance in bytes between consecutive destination rows
//   width     - pixels per row, height - number of rows
//   xOffset   - byte offset of the wanted channel inside a source pixel
//   pixelSize - size in bytes of one source pixel
__global__ void
copy32FSurfaceToComplexMat(cudaSurfaceObject_t src,
                           uchar* dst, size_t dstStep,
                           size_t width, size_t height, size_t xOffset, size_t pixelSize)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (col >= width || row >= height)
        return;

    // Value-initialised so the second component stays 0.
    float2 value{};
    surf2Dread(&value.x, src, col * pixelSize + xOffset, row);

    // Address the mirrored destination row as an array of float2 pixels.
    uchar* rowStart = dst + (height - row - 1) * dstStep;
    reinterpret_cast<float2*>(rowStart)[col] = value;
}
// Source channels are 16-bit half floats; destination pixels are float2
// (i.e. a 32FC2 mat).
// Reads one half channel per pixel from the surface, widens it to float and
// writes it as the first component of the destination pixel; the second
// component is set to 0. Rows are flipped vertically.
//   dstStep   - distance in bytes between consecutive destination rows
//   width     - pixels per row, height - number of rows
//   xOffset   - byte offset of the wanted channel inside a source pixel
//   pixelSize - size in bytes of one source pixel
__global__ void
copy16FSurfaceToComplexMat(cudaSurfaceObject_t src,
                           uchar* dst, size_t dstStep,
                           size_t width, size_t height, size_t xOffset, size_t pixelSize)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (col >= width || row >= height)
        return;

    // The surface read works on 16-bit integers, so fetch the raw bits
    // straight into the storage of a half.
    half raw;
    surf2Dread(reinterpret_cast<uint16_t*>(&raw), src, col * pixelSize + xOffset, row);

    // Value-initialised so the second component stays 0.
    float2 value{};
    value.x = __half2float(raw);

    // Address the mirrored destination row as an array of float2 pixels.
    uchar* rowStart = dst + (height - row - 1) * dstStep;
    reinterpret_cast<float2*>(rowStart)[col] = value;
}
// Source channels are 16-bit unsigned integers; destination pixels are float2
// (i.e. a 32FC2 mat).
// Reads one 16-bit channel per pixel from the surface, scales it by 1/65536
// and writes it as the first component of the destination pixel; the second
// component is set to 0. Rows are flipped vertically.
//   dstStep   - distance in bytes between consecutive destination rows
//   width     - pixels per row, height - number of rows
//   xOffset   - byte offset of the wanted channel inside a source pixel
//   pixelSize - size in bytes of one source pixel
__global__ void
copy16USurfaceToComplexMat(cudaSurfaceObject_t src,
                           uchar* dst, size_t dstStep,
                           size_t width, size_t height, size_t xOffset, size_t pixelSize)
{
    // Calculate surface coordinates
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        // Value-initialised so the second component stays 0.
        float2 data{};

        // Read from input surface
        uint16_t x16bits;
        surf2Dread(&x16bits, src, x * pixelSize + xOffset, y);

        // Scale in single precision. The factor is a power of two, so the
        // product is exact and matches the previous double-precision
        // expression bit for bit, without any double arithmetic on the device.
        // NOTE(review): dividing by 65536 maps the maximum input 65535 to
        // slightly below 1.0; confirm whether 65535 was intended.
        constexpr float kScale = 1.0f / 65536.0f;
        data.x = static_cast<float>(x16bits) * kScale;

        // Write to output mat data (vertically mirrored row)
        float2* pixel = reinterpret_cast<float2*>(dst + (height - y - 1) * dstStep + x * 2 * sizeof(float));
        *pixel = data;
    }
}
// Source channels are 8-bit unsigned integers; destination pixels are float2
// (i.e. a 32FC2 mat).
// Reads one 8-bit channel per pixel from the surface, scales it by 1/256 and
// writes it as the first component of the destination pixel; the second
// component is set to 0. Rows are flipped vertically.
//   dstStep   - distance in bytes between consecutive destination rows
//   width     - pixels per row, height - number of rows
//   xOffset   - byte offset of the wanted channel inside a source pixel
//   pixelSize - size in bytes of one source pixel
__global__ void
copy8USurfaceToComplexMat(cudaSurfaceObject_t src,
                          uchar* dst, size_t dstStep,
                          size_t width, size_t height, size_t xOffset, size_t pixelSize)
{
    // Calculate surface coordinates
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        // Value-initialised so the second component stays 0.
        float2 data{};

        // Read from input surface
        uint8_t x8bit;
        surf2Dread(&x8bit, src, x * pixelSize + xOffset, y);

        // Scale in single precision. The factor is a power of two, so the
        // product is exact and matches the previous double-precision
        // expression bit for bit, without any double arithmetic on the device.
        // NOTE(review): dividing by 256 maps the maximum input 255 to
        // slightly below 1.0; confirm whether 255 was intended.
        constexpr float kScale = 1.0f / 256.0f;
        data.x = static_cast<float>(x8bit) * kScale;

        // Write to output mat data (vertically mirrored row)
        float2* pixel = reinterpret_cast<float2*>(dst + (height - y - 1) * dstStep + x * 2 * sizeof(float));
        *pixel = data;
    }
}
// Source pixels are two floats each (32FC2); the surface receives one float
// per pixel (32FC1).
// Copies the first float of every source pixel to the surface and ignores the
// second. Rows are flipped vertically.
//   src     - first byte of the source buffer
//   srcStep - distance in bytes between consecutive source rows
//   width   - pixels per row, height - number of rows
__global__ void
copyComplexMatToSurface(uchar* src,
                        cudaSurfaceObject_t dst, size_t srcStep,
                        int width, int height)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (col >= width || row >= height)
        return;

    // Locate the mirrored source row and pick the first float of the pixel,
    // which sits at float index 2 * col.
    const uchar* rowStart = src + (height - row - 1) * srcStep;
    const float real = reinterpret_cast<const float*>(rowStart)[2 * col];

    // Surface x coordinates are byte offsets: one float per pixel.
    surf2Dwrite(real, dst, col * sizeof(float), row);
}
// Copies a pitched byte buffer into a surface one byte per thread, flipping
// the image vertically (source row height-1-y lands in surface row y).
//   src     - first byte of the source buffer
//   dst     - surface to write to
//   srcStep - distance in bytes between consecutive source rows
//   width   - number of bytes to copy per row
//   height  - number of rows
__global__ void
copyMatToSurface(uchar* src,
                 cudaSurfaceObject_t dst, size_t srcStep,
                 int width, int height)
{
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so discard the overhang.
    if (col >= width || row >= height)
        return;

    // Pick the byte from the vertically mirrored source row.
    const uchar value = src[(height - row - 1) * srcStep + col];

    // Store it in the surface (x is a byte offset).
    surf2Dwrite(value, dst, col, row);
}
}
// Extracts one channel of a CUDA array into `output`, whose pixels are written
// as float2 (CV_32FC2): the channel value goes into the first component and
// the second component is set to 0. Rows are flipped vertically.
//   width, height - image size in pixels
//   input         - source CUDA array
//   output        - destination mat; must already be allocated with at least
//                   `height` rows and room for `width` float2 values per row
//   numChannels   - number of channels per source pixel
//   channel       - zero-based index of the channel to extract
//   cf            - storage format of each source channel (U8, U16, F16, F32)
// The call does nothing when the arguments are unusable (non-positive size,
// null array, empty or too-small mat, channel outside [0, numChannels),
// unsupported format).
// The kernel is launched on the default stream and CUDA return codes are not
// checked here.
void
GpuUtils::arrayToComplexMatGPU(int width, int height, cudaArray* input, cv::cuda::GpuMat& output, int numChannels, int channel, ChannelFormat cf)
{
    // A non-positive size would yield an invalid (empty or wrapped) grid.
    if (width <= 0 || height <= 0 || input == nullptr || output.empty())
        return;

    // The selected channel has to lie inside the source pixel.
    if (numChannels <= 0 || channel < 0 || channel >= numChannels)
        return;

    // Every destination pixel is written as one float2; refuse to write
    // past the end of the destination rows.
    const size_t neededRowBytes = static_cast<size_t>(width) * sizeof(float2);
    const size_t availableRowBytes = static_cast<size_t>(output.cols) * output.elemSize();
    if (output.rows < height || availableRowBytes < neededRowBytes)
        return;

    // Size in bytes of a single channel for the given format. Bail out
    // before creating any CUDA resource if the format is not supported.
    size_t channelBytes = 0;
    switch (cf)
    {
        case GpuUtils::ChannelFormat::U8:
            channelBytes = 1;
            break;
        case GpuUtils::ChannelFormat::U16:
        case GpuUtils::ChannelFormat::F16:
            channelBytes = 2;
            break;
        case GpuUtils::ChannelFormat::F32:
            channelBytes = 4;
            break;
        default:
            return;
    }

    // Byte offset of the channel inside a pixel, and byte size of a pixel.
    const size_t xOffset = static_cast<size_t>(channel) * channelBytes;
    const size_t pixelSize = static_cast<size_t>(numChannels) * channelBytes;

    // Create the input surface object
    cudaSurfaceObject_t inputS{};
    createSurfaceObj(&inputS, input);

    // One thread per pixel, grid rounded up to whole 16x16 blocks.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    switch (cf)
    {
        case GpuUtils::ChannelFormat::U8:
            copy8USurfaceToComplexMat<<<gridSize, blockSize>>>(inputS, output.data, output.step, width, height, xOffset, pixelSize);
            break;
        case GpuUtils::ChannelFormat::U16:
            copy16USurfaceToComplexMat<<<gridSize, blockSize>>>(inputS, output.data, output.step, width, height, xOffset, pixelSize);
            break;
        case GpuUtils::ChannelFormat::F16:
            copy16FSurfaceToComplexMat<<<gridSize, blockSize>>>(inputS, output.data, output.step, width, height, xOffset, pixelSize);
            break;
        case GpuUtils::ChannelFormat::F32:
            copy32FSurfaceToComplexMat<<<gridSize, blockSize>>>(inputS, output.data, output.step, width, height, xOffset, pixelSize);
            break;
        default:
            break;
    }

    cudaDestroySurfaceObject(inputS);
}
// Copies the first float of every float2 (CV_32FC2) pixel of `input` into the
// CUDA array `output` as one float per pixel. Rows are flipped vertically.
//   width, height - image size in pixels
//   input         - source mat; must hold at least `height` rows and `width`
//                   float2 values per row
//   output        - destination CUDA array
// The call does nothing when the arguments are unusable (non-positive size,
// null array, empty or too-small mat).
// The kernel is launched on the default stream and CUDA return codes are not
// checked here.
void
GpuUtils::complexMatGPUToArray(int width, int height, const cv::cuda::GpuMat& input, cudaArray* output)
{
    // A non-positive size would yield an invalid (empty or wrapped) grid.
    if (width <= 0 || height <= 0 || output == nullptr || input.empty())
        return;

    // Every source pixel is read as part of a float2; refuse to read past
    // the end of the source rows.
    const size_t neededRowBytes = static_cast<size_t>(width) * sizeof(float2);
    const size_t availableRowBytes = static_cast<size_t>(input.cols) * input.elemSize();
    if (input.rows < height || availableRowBytes < neededRowBytes)
        return;

    // Create the output surface object
    cudaSurfaceObject_t outputS{};
    createSurfaceObj(&outputS, output);

    // One thread per pixel, grid rounded up to whole 16x16 blocks.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    copyComplexMatToSurface<<<gridSize, blockSize>>>(input.data, outputS, input.step, width, height);

    cudaDestroySurfaceObject(outputS);
}
// Copies the raw bytes of `input` into the CUDA array `output`, flipping the
// rows vertically.
//   width, height - image size in pixels
//   input         - source mat; must hold at least `height` rows and
//                   `width * pixelSize` bytes per row
//   output        - destination CUDA array
//   pixelSize     - size in bytes of one pixel
// The call does nothing when the arguments are unusable (non-positive size or
// pixel size, null array, empty or too-small mat).
// The kernel is launched on the default stream and CUDA return codes are not
// checked here.
void
GpuUtils::matGPUToArray(int width, int height, const cv::cuda::GpuMat& input, cudaArray* output, int pixelSize)
{
    // A non-positive size would yield an invalid (empty or wrapped) grid.
    if (width <= 0 || height <= 0 || pixelSize <= 0)
        return;
    if (output == nullptr || input.empty())
        return;

    // The copy is done byte by byte, so work with the row length in bytes.
    width *= pixelSize;

    // Refuse to read past the end of the source rows.
    const size_t availableRowBytes = static_cast<size_t>(input.cols) * input.elemSize();
    if (input.rows < height || availableRowBytes < static_cast<size_t>(width))
        return;

    // Create the output surface object
    cudaSurfaceObject_t outputS{};
    createSurfaceObj(&outputS, output);

    // One thread per byte, grid rounded up to whole 16x16 blocks.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    uchar* inData = input.data;
    copyMatToSurface<<<gridSize, blockSize>>>(inData, outputS, input.step, width, height);

    cudaDestroySurfaceObject(outputS);
}
// Copies the raw bytes of the CUDA array `input` into `output`, flipping the
// rows vertically.
//   width, height - image size in pixels
//   input         - source CUDA array
//   output        - destination mat; must already be allocated with at least
//                   `height` rows and `width * pixelSize` bytes per row
//   pixelSize     - size in bytes of one pixel
// The call does nothing when the arguments are unusable (non-positive size or
// pixel size, null array, empty or too-small mat).
// The kernel is launched on the default stream and CUDA return codes are not
// checked here.
void
GpuUtils::arrayToMatGPU(int width, int height, cudaArray* input, cv::cuda::GpuMat& output, int pixelSize)
{
    // A non-positive size would yield an invalid (empty or wrapped) grid.
    if (width <= 0 || height <= 0 || pixelSize <= 0)
        return;
    if (input == nullptr || output.empty())
        return;

    // The copy is done byte by byte, so work with the row length in bytes.
    width *= pixelSize;

    // Refuse to write past the end of the destination rows.
    const size_t availableRowBytes = static_cast<size_t>(output.cols) * output.elemSize();
    if (output.rows < height || availableRowBytes < static_cast<size_t>(width))
        return;

    // Create the input surface object
    cudaSurfaceObject_t inputS{};
    createSurfaceObj(&inputS, input);

    // One thread per byte, grid rounded up to whole 16x16 blocks.
    const dim3 blockSize(16, 16);
    const dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    uchar* outData = output.data;
    copySurfaceToMat<<<gridSize, blockSize>>>(inputS, outData, output.step, width, height);

    cudaDestroySurfaceObject(inputS);
}