-
Notifications
You must be signed in to change notification settings - Fork 94
/
Copy pathutils.cuh
369 lines (309 loc) · 7.57 KB
/
utils.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#pragma once
#include "cublas_v2.h"
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include <sstream>
#include <thrust/system/cuda/error.h>
#include <cusolver_common.h>
#include <ctime>
#include <cusparse.h>
namespace h2o4gpu
{
#define h2o4gpu_error(x) error(x, __FILE__, __LINE__);
inline void error(const char* e, const char* file, int line)
{
std::stringstream ss;
ss << e << " - " << file << "(" << line << ")";
//throw error_text;
std::cerr << ss.str() << std::endl;
exit(-1);
}
#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__);
inline void check(bool val, const char* e, const char* file, int line)
{
if (!val)
{
error(e, file, line);
}
}
#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)
inline cudaError_t throw_on_cuda_error(cudaError_t code, const char* file, int line)
{
if (code != cudaSuccess)
{
std::stringstream ss;
ss << cudaGetErrorString(code) << " - " << file << "(" << line << ")";
//throw error_text;
std::cerr << ss.str() << std::endl;
exit(-1);
}
return code;
}
inline static const char* cublasGetErrorEnum(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
#define safe_cublas(ans) throw_on_cublas_error((ans), __FILE__, __LINE__)
inline cublasStatus_t throw_on_cublas_error(cublasStatus_t status, const char* file, int line)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
std::stringstream ss;
ss << cublasGetErrorEnum(status) << " - " << file << "(" << line << ")";
std::string error_text;
ss >> error_text;
//throw error_text;
std::cerr << error_text << std::endl;
exit(-1);
}
return status;
}
#define safe_cusolver(ans) throw_on_cusolver_error((ans), __FILE__, __LINE__)
inline cusolverStatus_t throw_on_cusolver_error(cusolverStatus_t status, const char* file, int line)
{
if (status != CUSOLVER_STATUS_SUCCESS)
{
std::stringstream ss;
ss << "cusolver error: " << file << "(" << line << ")";
std::string error_text;
ss >> error_text;
//throw error_text;
std::cerr << error_text << std::endl;
exit(-1);
}
return status;
}
#define safe_cusparse(ans) throw_on_cusparse_error((ans), __FILE__, __LINE__)
inline cusparseStatus_t throw_on_cusparse_error(cusparseStatus_t status, const char* file, int line)
{
if (status != CUSPARSE_STATUS_SUCCESS)
{
std::stringstream ss;
ss << "cusparse error: " << file << "(" << line << ")";
std::string error_text;
ss >> error_text;
//throw error_text;
std::cerr << error_text << std::endl;
exit(-1);
}
return status;
}
template <typename T>
void print(thrust::device_vector<T>& v)
{
thrust::device_vector<T> h_v = v;
for (int i = 0; i < h_v.size(); i++)
{
std::cout << h_v[i] << " ";
}
std::cout << "\n";
}
#define TIMERS
struct Timer
{
volatile double start;
Timer() { reset(); }
double seconds_now()
{
#ifdef _WIN32
static LARGE_INTEGER s_frequency;
QueryPerformanceFrequency(&s_frequency);
LARGE_INTEGER now;
QueryPerformanceCounter(&now);
return static_cast<double>(now.QuadPart) / s_frequency.QuadPart;
#else
return 0;
#endif
}
void reset()
{
#ifdef _WIN32
_ReadWriteBarrier();
start = seconds_now();
#endif
}
double elapsed()
{
#ifdef _WIN32
_ReadWriteBarrier();
return seconds_now() - start;
#else
return 0;
#endif
}
void printElapsed(std::string label)
{
#ifdef TIMERS
safe_cuda(cudaDeviceSynchronize());
printf("%s:\t %1.4fs\n", label.c_str(), elapsed());
#endif
}
};
inline double clocks_to_s(clock_t t)
{
return (double)t / CLOCKS_PER_SEC;
}
struct sqr_op
{
__device__ float operator()(float val) const
{
return val * val;
};
};
struct sqrt_op
{
__device__ float operator()(float val) const
{
return sqrt(val);
};
};
/*
* Range iterator
*/
class range
{
public:
class iterator
{
friend class range;
public:
__host__ __device__ int64_t operator*() const { return i_; }
__host__ __device__ const iterator& operator++()
{
i_ += step_;
return *this;
}
__host__ __device__ iterator operator++(int)
{
iterator copy(*this);
i_ += step_;
return copy;
}
__host__ __device__ bool operator==(const iterator& other) const
{
return i_ >= other.i_;
}
__host__ __device__ bool operator!=(const iterator& other) const
{
return i_ < other.i_;
}
__host__ __device__ void step(int s) { step_ = s; }
protected:
__host__ __device__ explicit iterator(int64_t start) : i_(start)
{
}
public:
uint64_t i_;
int step_ = 1;
};
__host__ __device__ iterator begin() const { return begin_; }
__host__ __device__ iterator end() const { return end_; }
__host__ __device__ range(int64_t begin, int64_t end)
: begin_(begin), end_(end)
{
}
__host__ __device__ void step(int s) { begin_.step(s); }
private:
iterator begin_;
iterator end_;
};
template <typename T>
__device__ range grid_stride_range(T begin, T end)
{
begin += blockDim.x * blockIdx.x + threadIdx.x;
range r(begin, end);
r.step(gridDim.x * blockDim.x);
return r;
}
template <typename T>
__device__ range block_stride_range(T begin, T end)
{
begin += threadIdx.x;
range r(begin, end);
r.step(blockDim.x);
return r;
}
// Threadblock iterates over range, filling with value
template <typename IterT, typename ValueT>
__device__ void block_fill(IterT begin, size_t n, ValueT value)
{
for (auto i : block_stride_range(static_cast<size_t>(0), n))
{
begin[i] = value;
}
}
template <typename SrcIterT, typename DestIterT>
__device__ void block_copy(SrcIterT src, DestIterT dest, size_t n)
{
for (auto i : block_stride_range(static_cast<size_t>(0), n))
{
dest[i] = src[i];
}
}
template <typename T>
void tprint(thrust::device_vector<T> &v, const char * label = "")
{
if (strlen(label))
{
printf("%s: ", label);
}
thrust::host_vector<T> h_v = v;
for (int i = 0; i < v.size(); i++)
{
std::cout << h_v[i] << " ";
}
std::cout << "\n";
}
// Keep track of cub library device allocation
struct CubMemory {
void *d_temp_storage;
size_t temp_storage_bytes;
CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
~CubMemory() {
this->Free();
}
void Free()
{
if (d_temp_storage != NULL) {
safe_cuda(cudaFree(d_temp_storage));
}
temp_storage_bytes = 0;
}
void LazyAllocate( size_t bytes) {
if (bytes > temp_storage_bytes){
this->Free();
temp_storage_bytes = bytes;
safe_cuda(cudaMalloc(&d_temp_storage, temp_storage_bytes));
}
}
bool IsAllocated() { return d_temp_storage != NULL; }
};
inline void generate_column_segments(thrust::device_vector<int>& column_segments, int col_size)
{
auto counting = thrust::make_counting_iterator(0);
thrust::transform(counting, counting + column_segments.size(), column_segments.begin(), [=]__device__(int idx)
{
return idx * col_size;
});
}
}