// sparse_attempt.cc (forked from apache/mxnet)
// Fetch grad_in with a transposed shape here since the dst of csrmm is stored transposed
Tensor<xpu, 2, DType> grad_in = in_grad[embedding::kWeight].get_with_shape<xpu, 2, DType>(
Shape2(param_.output_dim, param_.input_dim), s);
// Convert the input word indices into CSR format using the requested temp workspace
index_t word_num = static_cast<index_t>(param_.input_dim);
Tensor<xpu, 1, int> workspace = ctx.requested[embedding::kTempSpace]
.get_space_typed<xpu, 1, int>(
Shape1(data.shape_.Size() * 2 + 1), s);
Tensor<xpu, 1, int> data_col_ind = Tensor<xpu, 1, int>(workspace.dptr_,
Shape1(data.shape_.Size()),
data.shape_.Size(),
s);
Tensor<xpu, 1, int> data_row_ptr = Tensor<xpu, 1, int>(workspace.dptr_ + data.shape_.Size(),
Shape1(data.shape_.Size() + 1),
data.shape_.Size() + 1,  // stride matches the shape so the 1-D view is contiguous
s);
data_row_ptr = range<int>(0, data.shape_.Size() + 1);  // one nonzero per flattened input index
data_col_ind = tcast<int>(data);  // column index of each nonzero = the word id
data = scalar<DType>(1.0f);  // all stored values are 1: data now holds the one-hot CSR values
LOG(INFO) << "embedding: kWriteTo";
// grad_in = scalar<DType>(0.0f);
// AddTakeGrad(grad_in, data, data_row_ptr, data_col_ind, grad_out);
csrmm<true, false>(data, data_row_ptr, data_col_ind, grad_out,
DType(1.0f), DType(0.0f), grad_in);
data = tcast<DType>(data_col_ind);  // restore the word indices overwritten above
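// A minimal worked example of the CSR layout built above (values are
// illustrative, assuming input_dim = 4 and a flattened batch of 3 indices):
//   data (word ids)  = [2, 0, 3]
//   data_row_ptr     = [0, 1, 2, 3]   // range<int>(0, 4): one nonzero per index
//   data_col_ind     = [2, 0, 3]      // tcast<int>(data)
//   values           = [1, 1, 1]      // data after scalar<DType>(1.0f)
// i.e. the implied sparse matrix A is the 3 x 4 one-hot matrix
//   [[0 0 1 0]
//    [1 0 0 0]
//    [0 0 0 1]]
// and csrmm<true, false> computes grad_in = (A^T * grad_out)^T, scattering each
// row of grad_out into the grad_in column of the word that produced it.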
/*! \brief cusparse handle */
cusparseHandle_t sparse_handle_;
/*! \brief cusparse handle ownership */
HandleState sparse_handle_ownership_;
st->CreateSparseHandle();
stream->DestroySparseHandle();
/*!
* \brief return the actual cusparseHandle
* \param stream pointer to the GPU stream
*/
inline static cusparseHandle_t GetSparseHandle(Stream<gpu> *stream) {
if (stream == NULL) {
return 0;
} else {
CHECK_NE(stream->sparse_handle_ownership_, NoHandle)
<< "No handle exists in source stream";
return stream->sparse_handle_;
}
}
/*! \brief Destroy the cusparse handle if we own it */
inline void DestroySparseHandle() {
if (sparse_handle_ownership_ == OwnHandle) {
cusparseStatus_t err = cusparseDestroy(sparse_handle_);
sparse_handle_ownership_ = NoHandle;
CHECK_EQ(err, CUSPARSE_STATUS_SUCCESS) << "Destroy cusparse handle failed";
}
}
/*! \brief Destroy the original sparse handle and create a new one */
inline void CreateSparseHandle() {
this->DestroySparseHandle();
cusparseStatus_t err = cusparseCreate(&sparse_handle_);
sparse_handle_ownership_ = OwnHandle;
CHECK_EQ(err, CUSPARSE_STATUS_SUCCESS) << "Create cusparse handle failed";
}
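// A sketch of the intended lifecycle (assuming these helpers end up on
// mshadow's Stream<gpu>, alongside its existing cuBLAS handle management):
//   Stream<gpu> *s = ...;                  // some already-created GPU stream
//   s->CreateSparseHandle();               // create (and own) a cusparse handle
//   cusparseHandle_t h = Stream<gpu>::GetSparseHandle(s);  // as used by csrmm below
//   // ... launch csrmm calls on this stream ...
//   s->DestroySparseHandle();              // release the handle before the stream is freed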
/*!
* \brief CPU/GPU: dst^T = alpha * op(A) * op(B) + beta * dst^T;
op(A) = trans_A ? A^T : A; op(B) = trans_B ? B^T : B;
A is stored in Compressed Sparse Row (CSR) format
Refer to https://en.wikipedia.org/wiki/Sparse_matrix
* \param A_val values of the nnz elements in A
* \param A_row_ptr CSR row pointer array (length = number of rows of A + 1)
* \param A_col_ind column indices of the nnz elements
* \param B dense matrix
* \param alpha scaling factor for the product
* \param beta scaling factor for the existing dst
* \param dst destination; the result is stored transposed
*/
template<bool trans_A, bool trans_B, typename DType>
inline void csrmm(const Tensor<cpu, 1, DType> &A_val,
const Tensor<cpu, 1, int> &A_row_ptr,
const Tensor<cpu, 1, int> &A_col_ind,
const Tensor<cpu, 2, DType> &B,
DType alpha,
DType beta,
Tensor<cpu, 2, DType> dst);
/*!
* \brief CPU/GPU: dst^T = alpha * op(A) * op(B) + beta * dst^T;
op(A) = trans_A ? A^T : A; op(B) = trans_B ? B^T : B;
A is stored in Compressed Sparse Row (CSR) format
Refer to https://en.wikipedia.org/wiki/Sparse_matrix
* \param A_val values of the nnz elements in A
* \param A_row_ptr CSR row pointer array (length = number of rows of A + 1)
* \param A_col_ind column indices of the nnz elements
* \param B dense matrix
* \param alpha scaling factor for the product
* \param beta scaling factor for the existing dst
* \param dst destination; the result is stored transposed
*/
template<bool trans_A, bool trans_B, typename DType>
inline void csrmm(const Tensor<gpu, 1, DType> &A_val,
const Tensor<gpu, 1, int> &A_row_ptr,
const Tensor<gpu, 1, int> &A_col_ind,
const Tensor<gpu, 2, DType> &B,
DType alpha,
DType beta,
Tensor<gpu, 2, DType> dst);
// Follows http://www.netlib.org/utk/people/JackDongarra/etemplates/node382.html
template<bool trans_A, bool trans_B, typename DType>
inline void csrmm(const Tensor<cpu, 1, DType> &A_val,
const Tensor<cpu, 1, int> &A_row_ptr,
const Tensor<cpu, 1, int> &A_col_ind,
const Tensor<cpu, 2, DType> &B,
DType alpha,
DType beta,
Tensor<cpu, 2, DType> dst) {
using namespace mshadow::expr;
int m = A_row_ptr.shape_.Size() - 1;
int k = trans_A ? dst.size(1) : (trans_B ? B.size(1) : B.size(0));
int n = trans_B ? B.size(0) : B.size(1);
int ldb = B.size(1);
int nnz = A_val.shape_.Size();
int ldc = dst.size(1);
LOG(INFO) << "m = " << m << ", k = " << k << ", n = " << n << ", nnz = " << nnz;
LOG(INFO) << "Shape: A_val = " << A_val.shape_ <<" A_row_ptr = " << A_row_ptr.shape_ << " A_col_ind = " << A_col_ind.shape_ << " B = " << B.shape_ << " dst = " << dst.shape_;
CHECK((dst.size(1) == (trans_A ? k : m)) && (dst.size(0) == n)) << "Shape error,"
<< " need dst = (" << n << ", " << (trans_A ? k : m) << "),"
<< " get " << dst.shape_;
dst *= ScalarExp<DType>(beta);
if (trans_A) {
// dst^T = alpha * A^T * op(B): nonzero A(j, A_col_ind[i]) scatters row j of op(B)
// into column A_col_ind[i] of the transposed result.
for (index_t j = 0; j < m; ++j) {
for (index_t col = 0; col < n; ++col) {
for (index_t i = A_row_ptr[j]; i < A_row_ptr[j + 1]; ++i) {
dst[col][A_col_ind[i]] += alpha * A_val[i] * (trans_B ? B[col][j] : B[j][col]);
}
}
}
} else {
// dst^T = alpha * A * op(B): nonzero A(j, A_col_ind[i]) gathers row A_col_ind[i] of op(B)
// into column j of the transposed result.
for (index_t j = 0; j < m; ++j) {
for (index_t col = 0; col < n; ++col) {
for (index_t i = A_row_ptr[j]; i < A_row_ptr[j + 1]; ++i) {
dst[col][j] += alpha * A_val[i] * (trans_B ? B[col][A_col_ind[i]] : B[A_col_ind[i]][col]);
}
}
}
}
}
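// A minimal worked example for the CPU path above (values chosen purely for
// illustration): take the 2 x 3 CSR matrix
//   A = [[0 5 0],
//        [7 0 8]]   ->  A_val = [5, 7, 8], A_row_ptr = [0, 1, 3], A_col_ind = [1, 0, 2]
// and the 3 x 2 dense matrix B = [[1 2], [3 4], [5 6]]. With trans_A = false,
// trans_B = false, alpha = 1 and beta = 0, A * B = [[15 20], [47 62]], so dst
// must be a 2 x 2 tensor holding the transposed result [[15 47], [20 62]].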
// Use cuSPARSE
template<bool trans_A, bool trans_B>
inline void csrmm(const Tensor<gpu, 1, float> &A_val,
const Tensor<gpu, 1, int> &A_row_ptr,
const Tensor<gpu, 1, int> &A_col_ind,
const Tensor<gpu, 2, float> &B,
float alpha,
float beta,
Tensor<gpu, 2, float> dst) {
CHECK_EQ(A_val.CheckContiguous(), true);
CHECK_EQ(A_row_ptr.CheckContiguous(), true);
CHECK_EQ(A_col_ind.CheckContiguous(), true);
CHECK_EQ(B.CheckContiguous(), true);
int m = A_row_ptr.shape_.Size() - 1;
int k = trans_A ? dst.size(1) : (trans_B ? B.size(1) : B.size(0));
int n = trans_B ? B.size(0) : B.size(1);
int ldb = B.size(1);
int nnz = A_val.shape_.Size();
int ldc = dst.size(1);
LOG(INFO) << "m = " << m << ", k = " << k << ", n = " << n << ", nnz = " << nnz;
LOG(INFO) << "Shape: A_val = " << A_val.shape_ << " A_row_ptr = " << A_row_ptr.shape_ << " A_col_ind = " << A_col_ind.shape_ << " B = " << B.shape_ << " dst = " << dst.shape_;
CHECK((dst.size(1) == (trans_A ? k : m)) && (dst.size(0) == n)) << "Shape error,"
<< " need dst = (" << n << ", " << (trans_A ? k : m) << "),"
<< " get " << dst.shape_;
CHECK_EQ(A_col_ind.shape_.Size(), nnz) << "col_ind shape: " << A_col_ind.shape_
<< "val shape:" << A_val.shape_;
cusparseStatus_t err = cusparseSetStream(Stream<gpu>::GetSparseHandle(dst.stream_),
Stream<gpu>::GetStream(dst.stream_));
CHECK_EQ(err, CUSPARSE_STATUS_SUCCESS) << "cusparse: set stream failed";
cusparseMatDescr_t descr = NULL;
CHECK_EQ(cusparseCreateMatDescr(&descr), CUSPARSE_STATUS_SUCCESS) << "cusparse: create mat descr failed";
CHECK_EQ(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL), CUSPARSE_STATUS_SUCCESS)
<< "cusparse: set mat type failed";
CHECK_EQ(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO), CUSPARSE_STATUS_SUCCESS)
<< "cusparse: set mat index failed";
err = cusparseScsrmm2(Stream<gpu>::GetSparseHandle(dst.stream_),
trans_A ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
trans_B ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE,
m, n, k, nnz, &alpha, descr,
(const float*)A_val.dptr_,
(const int*)A_row_ptr.dptr_,
(const int*)A_col_ind.dptr_,
(const float*)B.dptr_, ldb, &beta, dst.dptr_, ldc);
CHECK_EQ(err, CUSPARSE_STATUS_SUCCESS) << "cusparse: csrmm failed";
CHECK_EQ(cusparseDestroyMatDescr(descr), CUSPARSE_STATUS_SUCCESS)
<< "cusparse: destroy mat descr failed!";
descr = NULL;
}
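// Note on the cuSPARSE call above: mshadow tensors are row-major while cuSPARSE
// assumes column-major storage, so the row-major B (ldb = B.size(1)) is seen by
// cuSPARSE as B^T. Passing CUSPARSE_OPERATION_TRANSPOSE when trans_B is false
// (and vice versa) compensates for that, and the column-major result C
// (ldc = dst.size(1)) occupies dst's memory as the row-major transpose, which
// matches the dst^T convention documented for csrmm.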
// Use cuSPARSE
template<bool trans_A, bool trans_B>
inline void csrmm(const Tensor<gpu, 1, double> &A_val,
const Tensor<gpu, 1, int> &A_row_ptr,
const Tensor<gpu, 1, int> &A_col_ind,
const Tensor<gpu, 2, double> &B,
double alpha,
double beta,
Tensor<gpu, 2, double> dst) {
LOG(FATAL) << "Not implemented!";
}
// Use cuSPARSE
template<bool trans_A, bool trans_B>
inline void csrmm(const Tensor<gpu, 1, half::half_t> &A_val,
const Tensor<gpu, 1, int> &A_row_ptr,
const Tensor<gpu, 1, int> &A_col_ind,
const Tensor<gpu, 2, half::half_t> &B,
half::half_t alpha,
half::half_t beta,
Tensor<gpu, 2, half::half_t> dst) {
LOG(FATAL) << "Not implemented!";
}