#pragma once
#include "utils.cuh"
#include "../device/device_context.cuh"
#include "cusolverDn.h"
#include <../../../cub/cub/cub.cuh>
// Thrust and standard-library headers used directly in this file
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/random.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <cstdio>
#include <vector>
#include <type_traits>
namespace matrix
{
using namespace h2o4gpu;
/**
* \class Matrix
*
* \brief Matrix type. Stores data internally in column-major order.
*
*/
template <typename T>
class Matrix
{
size_t _m;
size_t _n;
T* _data;
public:
/**
* \fn Matrix()
*
* \brief Default constructor.
*
*/
Matrix() : _m(0), _n(0), _data(nullptr)
{
}
/**
* \fn Matrix(size_t m, size_t n)
*
* \brief Constructor. Initialize matrix with m rows and n columns in device memory.
*
* \param m Matrix rows.
* \param n Matrix columns.
*/
Matrix(size_t m, size_t n) : _m(m), _n(n)
{
safe_cuda(cudaMalloc(&_data, _n*_m* sizeof(T)));
}
/**
* \fn Matrix(const Matrix<T>& M)
*
* \brief Copy constructor. Initializes the matrix by copying an existing matrix.
*
* \param M The Matrix<T> to copy.
*/
Matrix(const Matrix<T>& M) : _m(M.rows()), _n(M.columns())
{
safe_cuda(cudaMalloc(&_data, _n*_m* sizeof(T)));
this->copy(M);
}
~Matrix()
{
safe_cuda(cudaFree(_data));
}
/**
* \fn void resize(size_t m, size_t n)
*
* \brief Resizes the matrix to m rows and n columns. Existing contents are discarded.
*
* \param m Matrix rows.
* \param n Matrix columns.
*/
void resize(size_t m, size_t n)
{
_m = m;
_n = n;
if (_data != nullptr)
{
safe_cuda(cudaFree(_data));
}
safe_cuda(cudaMalloc(&_data, _n*_m* sizeof(T)));
}
/**
* \fn T* data()
*
* \brief Return raw pointer to data. Data is allocated on device.
*
* \return Raw pointer to Matrix data.
*/
T* data()
{
return _data;
}
/**
* \fn const T* data() const
*
* \brief Return const raw pointer to data. Data is allocated on device.
*
* \return Const raw pointer to Matrix data.
*/
const T* data() const
{
return _data;
}
/**
* \fn thrust::device_ptr<T> dptr()
*
* \brief Get thrust device pointer to matrix data. Useful for invoking thrust functions.
*
* \return A thrust::device_ptr<T>
*/
thrust::device_ptr<T> dptr()
{
return thrust::device_pointer_cast(_data);
}
/**
* \fn thrust::device_ptr<const T> dptr() const
*
* \brief Get const thrust device pointer to matrix data. Useful for invoking thrust functions.
*
* \return A thrust::device_ptr<const T>
*/
thrust::device_ptr<const T> dptr() const
{
return thrust::device_pointer_cast(_data);
}
/**
* \fn size_t rows() const
*
* \return Number of matrix rows.
*/
size_t rows() const
{
return _m;
}
/**
* \fn size_t columns() const
*
* \return Number of matrix columns.
*/
size_t columns() const
{
return _n;
}
/**
* \fn size_t size() const
*
* \return Number of matrix elements (m*n).
*/
size_t size() const
{
return _n * _m;
}
/**
* \fn void zero()
*
* \brief Zeroes matrix elements.
*
*/
void zero()
{
thrust::fill(thrust::device_ptr<T>(_data), thrust::device_ptr<T>(_data) + _n * _m, 0);
}
/**
* \fn void fill(T val)
*
* \brief Fills matrix with given value.
*
* \param val The value.
*/
void fill(T val)
{
thrust::fill(thrust::device_ptr<T>(_data), thrust::device_ptr<T>(_data) + _n * _m, val);
}
/**
* \fn void random(int random_seed = 0)
*
* \brief Fills matrix elements with uniformly distributed random numbers in [0, 1).
*
* \param random_seed (Optional) The random seed.
*/
void random(int random_seed = 0)
{
auto counting = thrust::make_counting_iterator(0);
thrust::transform(counting, counting + _m * _n,
thrust::device_ptr<T>(_data),
[=]__device__(int idx)
{
thrust::default_random_engine randEng(random_seed);
thrust::uniform_real_distribution<T> uniDist;
randEng.discard(idx);
return uniDist(randEng);
}
);
}
/**
* \fn void random_normal(int random_seed = 0)
*
* \brief Fills matrix elements with normally distributed random numbers (mean 0, standard deviation 1).
*
* \param random_seed (Optional) The random seed.
*/
void random_normal(int random_seed = 0)
{
auto counting = thrust::make_counting_iterator(0);
thrust::transform(counting, counting + _m * _n,
thrust::device_ptr<T>(_data),
[=]__device__(int idx)
{
thrust::default_random_engine randEng(random_seed);
thrust::normal_distribution<T> dist;
randEng.discard(idx);
return dist(randEng);
}
);
}
/**
* \fn void copy(const HostT* hptr)
*
* \brief Copies from a host pointer to the matrix. Assumes the host array has the same number of elements as the matrix; elements are converted to T if the host element type differs.
*
* \param hptr Host pointer.
*/
template <typename HostT>
void copy(const HostT* hptr)
{
if(std::is_same<HostT, T>::value){
thrust::copy(hptr, hptr + this->size(), this->dptr());
}else{
std::vector<T> temp(hptr, hptr + this->size());
thrust::copy(temp.begin(), temp.end(), this->dptr());
}
}
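/*
* Example (not from the original source): copying from a host array whose
* element type differs from T; values are converted on the way to the device.
* The names A and h_vals are hypothetical.
*
*   std::vector<double> h_vals(A.size(), 1.0);   // host data
*   A.copy(h_vals.data());                       // Matrix<float> A: doubles converted to float
*/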
/**
* \fn void copy(const Matrix<T>& M)
*
* \brief Copies the contents of M into this matrix. Dimensions must match.
*
* \param M The Matrix<T> to copy from.
*/
void copy(const Matrix<T>& M)
{
h2o4gpu_check(M.rows() == this->rows() && M.columns() == this->columns(), "Cannot copy matrix. Dimensions are different.");
thrust::copy(M.dptr(), M.dptr() + M.size(), this->dptr());
}
/**
* \fn void print() const
*
* \brief Prints the matrix contents to stdout (copies the data to the host first).
*/
void print() const
{
thrust::host_vector<T> h_matrix(thrust::device_ptr<T>(_data), thrust::device_ptr<T>(_data + _n * _m));
for (size_t i = 0; i < _m; i++)
{
for (size_t j = 0; j < _n; j++)
{
printf("%1.2f ", h_matrix[j * _m + i]);
}
printf("\n");
}
}
/**
* \fn void transform(function_t f)
*
* \brief Applies the functor f element-wise to the matrix in place.
*/
template<typename function_t>
void transform(function_t f)
{
thrust::transform(this->dptr(), this->dptr() + this->size(), this->dptr(), f);
}
// Copy contents of matrix to a host pointer. Assumes the destination holds at least size() elements.
template<typename host_ptr_t>
void copy_to_host(host_ptr_t ptr){
thrust::copy(this->dptr(), this->dptr() + this->size(), ptr);
}
};
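/*
* Example (not from the original source): a minimal usage sketch of Matrix<T>,
* assuming a CUDA-capable device and that this header's dependencies are
* available. The names A and h_A are hypothetical.
*
*   matrix::Matrix<float> A(4, 3);          // 4x3 matrix in device memory
*   A.random(42);                           // uniform random values in [0, 1)
*   std::vector<float> h_A(A.size());       // host buffer
*   A.copy_to_host(h_A.data());             // column-major copy to the host
*   A.print();                              // or inspect directly on stdout
*/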
void multiply_diag(const Matrix<float>& A, const Matrix<float>& B, Matrix<float>& C, device::DeviceContext& context, bool left_diag);
void multiply_diag(const Matrix<double>& A, const Matrix<double>& B, Matrix<double>& C, device::DeviceContext& context, bool left_diag);
/**
* \fn void multiply(const Matrix<float>& A, const Matrix<float>& B, Matrix<float>& C, device::DeviceContext& context, bool transpose_a = false, bool transpose_b = false, float alpha=1.0f);
*
* \brief Matrix multiplication: C = alpha * op(A) * op(B), where op() optionally transposes its argument and alpha is a scalar.
*
* \param A The Matrix<float> to process.
* \param B The Matrix<float> to process.
* \param [in,out] C The Matrix<float> to process.
* \param [in,out] context The context.
* \param transpose_a (Optional) True to transpose A.
* \param transpose_b (Optional) True to transpose B.
* \param alpha (Optional) The scalar multiplier alpha.
*/
void multiply(const Matrix<float>& A, const Matrix<float>& B, Matrix<float>& C, device::DeviceContext& context, bool transpose_a = false, bool transpose_b = false, float alpha = 1.0f);
/**
* \fn void multiply(const Matrix<double>& A, const Matrix<double>& B, Matrix<double>& C, device::DeviceContext& context, bool transpose_a = false, bool transpose_b = false, double alpha = 1.0);
*
* \brief Matrix multiplication: C = alpha * op(A) * op(B), where op() optionally transposes its argument and alpha is a scalar.
*
* \param A The Matrix<double> to process.
* \param B The Matrix<double> to process.
* \param [in,out] C The Matrix<double> to process.
* \param [in,out] context The context.
* \param transpose_a (Optional) True to transpose A.
* \param transpose_b (Optional) True to transpose B.
* \param alpha (Optional) The scalar multiplier alpha.
*/
void multiply(const Matrix<double>& A, const Matrix<double>& B, Matrix<double>& C, device::DeviceContext& context, bool transpose_a = false, bool transpose_b = false, double alpha = 1.0);
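/*
* Example (not from the original source): computing C = 2 * A' * B with the
* float overload above, assuming a device::DeviceContext named context and
* matrices sized so the product is defined (A is k x m, B is k x n, C is m x n):
*
*   matrix::multiply(A, B, C, context, true, false, 2.0f);
*/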
/**
* \fn void multiply(Matrix<T>& A, const U a, device::DeviceContext& context);
*
* \brief Matrix-scalar multiplication in place: A = a * A.
*
* \param [in,out] A The Matrix<T> to process.
* \param a The scalar.
* \param [in,out] context The context.
*/
template<typename T, typename U>
void multiply(Matrix<T>& A, const U a, device::DeviceContext& context);
/**
* \fn void subtract(const Matrix<T>& A, const Matrix<T>& B, Matrix<T>& C, device::DeviceContext& context)
*
* \brief Matrix subtraction: C = A - B.
*
*/
template<typename T>
void subtract(const Matrix<T>& A, const Matrix<T>& B, Matrix<T>& C, device::DeviceContext& context);
/**
* \fn void add(const Matrix<T>& A, const Matrix<T>& B, Matrix<T>& C, device::DeviceContext& context);
*
* \brief Matrix addition: C = A + B.
*
* \param A The Matrix<T> to process.
* \param B The Matrix<T> to process.
* \param [in,out] C The Matrix<T> to process.
* \param [in,out] context The context.
*/
template<typename T>
void add(const Matrix<T>& A, const Matrix<T>& B, Matrix<T>& C, device::DeviceContext& context);
/**
* \fn void transpose(const Matrix<float >&A, Matrix<float >&B, device::DeviceContext& context)
*
* \brief Transposes matrix A into matrix B.
*
* \param A The Matrix<float> to process.
* \param [in,out] B The Matrix<float> to process.
* \param [in,out] context The context.
*/
void transpose(const Matrix<float>& A, Matrix<float>& B, device::DeviceContext& context);
/**
* \fn void transpose(const Matrix<double >&A, Matrix<double >&B, device::DeviceContext& context)
*
* \brief Transposes matrix A into matrix B.
*
* \param A The Matrix<double> to process.
* \param [in,out] B The Matrix<double> to process.
* \param [in,out] context The context.
*/
void transpose(const Matrix<double>& A, Matrix<double>& B, device::DeviceContext& context);
/**
* \fn void normalize_columns(Matrix<float>& M, Matrix<float>& M_temp, Matrix<float>& column_length, const Matrix<float>& ones, device::DeviceContext& context);
*
* \brief Normalizes matrix columns in place.
*
* \param [in,out] M The Matrix<float> to process.
* \param [in,out] M_temp Temporary storage matrix of size >= M.
* \param [in,out] column_length Temporary storage matrix with one element per column of M.
* \param ones Matrix of ones of length M.columns().
* \param [in,out] context The context.
*/
void normalize_columns(Matrix<float>& M, Matrix<float>& M_temp, Matrix<float>& column_length, const Matrix<float>& ones, device::DeviceContext& context);
void normalize_columns(Matrix<double>& M, Matrix<double>& M_temp, Matrix<double>& column_length, const Matrix<double>& ones, device::DeviceContext& context);
void normalize_columns(Matrix<float>& M, device::DeviceContext& context);
void normalize_columns(Matrix<double>& M, device::DeviceContext& context);
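/*
* Example (not from the original source): normalizing the columns of M in
* place with the convenience overload above, assuming a device::DeviceContext
* named context:
*
*   matrix::normalize_columns(M, context);
*/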
/**
* \fn void normalize_vector_cublas(Matrix<float>& M, device::DeviceContext& context)
*
* \brief Normalize a vector utilizing cuBLAS
*
* \param [in,out] M The vector to process
* \param [in,out] context Device context.
*/
void normalize_vector_cublas(Matrix<float>& M, device::DeviceContext& context);
/**
* \fn void normalize_vector_cublas(Matrix<double>& M, device::DeviceContext& context)
*
* \brief Normalize a vector utilizing cuBLAS
*
* \param [in,out] M The vector to process
* \param [in,out] context Device context.
*/
void normalize_vector_cublas(Matrix<double>& M, device::DeviceContext& context);
/**
* \fn void normalize_vector_thrust(Matrix<float>& M, device::DeviceContext& context)
*
* \brief Normalize a vector utilizing Thrust
*
* \param [in,out] M The vector to process
* \param [in,out] context Device context.
*/
template<typename T>
void normalize_vector_thrust(Matrix<T>& M, device::DeviceContext& context);
/**
* \fn void residual(const Matrix<float >&X, const Matrix<float >&D, const Matrix<float >&S, Matrix<float >&R, device::DeviceContext & context);
*
* \brief Calculate residual R = X - DS
*
*/
void residual(const Matrix<float>& X, const Matrix<float>& D, const Matrix<float>& S, Matrix<float>& R, device::DeviceContext& context);
void residual(const Matrix<double>& X, const Matrix<double>& D, const Matrix<double>& S, Matrix<double>& R, device::DeviceContext& context);
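/*
* Example (not from the original source): computing R = X - D * S, assuming
* X is m x n, D is m x k, S is k x n, R is preallocated as m x n, and a
* device::DeviceContext named context:
*
*   matrix::residual(X, D, S, R, context);
*/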
void calculate_eigen_pairs_exact(const Matrix<float>& X, Matrix<float>& Q, Matrix<float>& w, device::DeviceContext& context);
void calculate_eigen_pairs_exact(const Matrix<double>& X, Matrix<double>& Q, Matrix<double>& w, device::DeviceContext& context);
void dot_product(Matrix<float>& b_k1, Matrix<float>& b_k, float* eigen_value_estimate, device::DeviceContext& context);
void dot_product(Matrix<double>& b_k1, Matrix<double>& b_k, double* eigen_value_estimate, device::DeviceContext& context);
void max_index_per_column(Matrix<float>& A, std::vector<int>& result_array, device::DeviceContext& context);
void max_index_per_column(Matrix<double>& A, std::vector<int>& result_array, device::DeviceContext& context);
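/*
* Example (not from the original source): finding, for each column of A, the
* row index of its largest element, assuming a device::DeviceContext named
* context (whether result_array must be pre-sized is left to the implementation):
*
*   std::vector<int> argmax(A.columns());
*   matrix::max_index_per_column(A, argmax, context);
*/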
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------
// Strictly floating-point operations that are not currently used
/**
* \fn void linear_solve(const Matrix<float>& A, Matrix<float>& X, const Matrix<float>& B, device::DeviceContext& context)
*
* \brief Solves the linear system AX = B for X.
*
* \param A The coefficient Matrix<float>.
* \param [in,out] X The Matrix<float> that receives the solution.
* \param B The right-hand-side Matrix<float>.
* \param [in,out] context The context.
*/
void linear_solve(const Matrix<float>& A, Matrix<float>& X, const Matrix<float>& B, device::DeviceContext& context);
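/*
* Example (not from the original source): solving A * X = B for X, assuming
* A is square (n x n), B and X are preallocated n x k matrices, and a
* device::DeviceContext named context:
*
*   matrix::linear_solve(A, X, B, context);   // X receives the solution
*/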
/**
* \fn void pseudoinverse(const Matrix<float>& A, Matrix<float>& pinvA, device::DeviceContext& context)
*
* \brief Calculates the Moore-Penrose pseudoinverse using the singular value decomposition (SVD) method.
*
* \param A Input matrix.
* \param [in,out] pinvA The pseudoinverse out.
* \param [in,out] context Device context.
*/
void pseudoinverse(const Matrix<float>& A, Matrix<float>& pinvA, device::DeviceContext& context);
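/*
* Example (not from the original source): computing the Moore-Penrose
* pseudoinverse of A into pinvA, assuming pinvA is preallocated as
* A.columns() x A.rows() and a device::DeviceContext named context:
*
*   matrix::pseudoinverse(A, pinvA, context);
*/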
}