forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
batch_sparse_to_dense_op.cu
146 lines (128 loc) · 3.97 KB
/
batch_sparse_to_dense_op.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include "caffe2/operators/batch_sparse_to_dense_op.h"
#include <cub/device/device_scan.cuh>
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
// Computes the inclusive prefix sum of the device array `dev_array`
// (length `num_items`) into `prefix_sum`, resizing `prefix_buffer` to serve
// as CUB's device scratch space. All work is enqueued on `context`'s stream;
// nothing here synchronizes with the host.
void array_prefix_sum_inclusive(
const int64_t* dev_array,
const int num_items,
Tensor& prefix_buffer,
Tensor& prefix_sum,
CUDAContext& context) {
  prefix_sum.Resize(num_items);
  int64_t* sum_ptr = prefix_sum.mutable_data<int64_t>();
  // Phase 1: a null workspace pointer makes CUB report the scratch size
  // it needs into `scratch_bytes` without doing any work.
  size_t scratch_bytes = 0;
  cub::DeviceScan::InclusiveSum(
      nullptr,
      scratch_bytes,
      dev_array,
      sum_ptr,
      num_items,
      context.cuda_stream());
  // Size the scratch tensor in int64_t elements, covering at least
  // `scratch_bytes` bytes.
  const auto scratch_elems =
      (scratch_bytes + sizeof(int64_t)) / sizeof(int64_t);
  prefix_buffer.Resize(scratch_elems);
  void* scratch_ptr =
      static_cast<void*>(prefix_buffer.mutable_data<int64_t>());
  // Phase 2: run the actual inclusive scan.
  cub::DeviceScan::InclusiveSum(
      scratch_ptr,
      scratch_bytes,
      dev_array,
      sum_ptr,
      num_items,
      context.cuda_stream());
}
// Scatters sparse (index, value) pairs into a dense [batch_size,
// dense_last_dim] output. One grid-stride iteration handles one batch row:
// row idx owns the slice [L_cum_sum_data[idx-1], L_cum_sum_data[idx]) of
// `indices_data`/`values_data`. Launched via CAFFE_GET_BLOCKS /
// CAFFE_CUDA_NUM_THREADS; assumes the output was pre-initialized by the
// caller (only listed positions are written).
__global__ void FillInDenseValuesKernel(
const int64_t batch_size,
const int64_t dense_last_dim,
const int64_t* indices_data,
const float* values_data,
const int64_t* L_cum_sum_data,
float* output_data) {
  CUDA_1D_KERNEL_LOOP(idx, batch_size) {
    // Use int64_t throughout: the cumulative sums, the sparse positions, and
    // the flattened output offset can all exceed INT_MAX for large inputs.
    // The previous version truncated these through `int` locals.
    const int64_t offset_start = idx == 0 ? 0 : L_cum_sum_data[idx - 1];
    const int64_t offset_end = L_cum_sum_data[idx];
    for (int64_t q = offset_start; q < offset_end; q++) {
      const int64_t indice = indices_data[q];
      output_data[idx * dense_last_dim + indice] = values_data[q];
    }
  }
}
// Gathers values from a dense [batch_size, dense_last_dim] input back into
// the flat sparse `output_data` array. One grid-stride iteration handles one
// batch row: row idx fills output positions [L_cum_sum_data[idx-1],
// L_cum_sum_data[idx]) by reading dense_data[idx][indices_data[q]].
// Launched via CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS.
__global__ void FillInSparseValuesKernel(
const int64_t batch_size,
const int64_t dense_last_dim,
const int64_t* indices_data,
const float* dense_data,
const int64_t* L_cum_sum_data,
float* output_data) {
  CUDA_1D_KERNEL_LOOP(idx, batch_size) {
    // Use int64_t throughout: the cumulative sums, the sparse positions, and
    // the flattened dense offset can all exceed INT_MAX for large inputs.
    // The previous version truncated these through `int` locals.
    const int64_t offset_start = idx == 0 ? 0 : L_cum_sum_data[idx - 1];
    const int64_t offset_end = L_cum_sum_data[idx];
    for (int64_t q = offset_start; q < offset_end; q++) {
      const int64_t indice = indices_data[q];
      output_data[q] = dense_data[idx * dense_last_dim + indice];
    }
  }
}
} // namespace
// CUDA specialization: scatter sparse (index, value) pairs into the dense
// output. Computes the prefix sum of per-row lengths, then launches one
// grid-stride kernel over the batch dimension.
template <>
void BatchSparseToDenseOp<float, CUDAContext>::FillInDenseValues(
const int64_t batch_size,
const int64_t indice_lengths,
const int64_t* lengths_data,
const int64_t* indices_data,
const float* values_data,
float* output_data,
CUDAContext* context) {
  // Prefix-sum the lengths so row i can locate its slice
  // [cumsum[i-1], cumsum[i]) of the flat indices/values arrays.
  // NOTE(review): the scan uses the member context_ while the launch below
  // uses the `context` argument — kept as in the original; confirm both
  // refer to the same stream.
  array_prefix_sum_inclusive(
      lengths_data, batch_size, len_prefix_tmp_, len_prefix_sum_, context_);
  // Clamp to >= 1 so an empty batch still produces a valid launch config.
  const int64_t kMinLaunchSize = 1;
  const auto num_blocks =
      CAFFE_GET_BLOCKS(std::max(batch_size, kMinLaunchSize));
  FillInDenseValuesKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      batch_size,
      dense_last_dim_,
      indices_data,
      values_data,
      len_prefix_sum_.data<int64_t>(),
      output_data);
}
// CUDA specialization: gather dense values back into the flat sparse output.
// Computes the prefix sum of per-row lengths, then launches one grid-stride
// kernel over the batch dimension.
template <>
void BatchDenseToSparseOp<float, CUDAContext>::FillInSparseValues(
const int64_t batch_size,
const int64_t indice_lengths,
const int64_t* lengths_data,
const int64_t* indices_data,
const float* dense_data,
float* output_data,
CUDAContext* context) {
  // Prefix-sum the lengths so row i can locate its slice
  // [cumsum[i-1], cumsum[i]) of the flat indices/output arrays.
  // NOTE(review): the scan uses the member context_ while the launch below
  // uses the `context` argument — kept as in the original; confirm both
  // refer to the same stream.
  array_prefix_sum_inclusive(
      lengths_data, batch_size, len_prefix_tmp_, len_prefix_sum_, context_);
  // Clamp to >= 1 so an empty batch still produces a valid launch config.
  const int64_t kMinLaunchSize = 1;
  const auto num_blocks =
      CAFFE_GET_BLOCKS(std::max(batch_size, kMinLaunchSize));
  FillInSparseValuesKernel<<<
      num_blocks,
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      batch_size,
      dense_last_dim_,
      indices_data,
      dense_data,
      len_prefix_sum_.data<int64_t>(),
      output_data);
}
// Register the float CUDA implementations of both conversion operators with
// the caffe2 operator registry.
REGISTER_CUDA_OPERATOR(
BatchSparseToDense,
BatchSparseToDenseOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
BatchDenseToSparse,
BatchDenseToSparseOp<float, CUDAContext>);
} // namespace caffe2