-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cu
136 lines (127 loc) · 4.29 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#include<cuda_runtime.h>
#include<thrust/scan.h>
#include<thrust/functional.h>
#include<iostream>
int add_(int a, int b)
{
return a+b;
}
__device__ int log_plus(int a, int b)
{
return a+b;
}
__device__ int segscan_warp(int* ptr, bool* hd, int idx) {
const unsigned int lane = idx & 31;
if (lane >= 1) {
ptr[idx] = hd[idx] ? ptr[idx] : log_plus(ptr[idx - 1] , ptr[idx]);
hd[idx] = hd[idx - 1] | hd[idx]; }
if (lane >= 2) {
ptr[idx] = hd[idx] ? ptr[idx] : log_plus(ptr[idx - 2] , ptr[idx]);
hd[idx] = hd[idx - 2] | hd[idx]; }
if (lane >= 4) {
ptr[idx] = hd[idx] ? ptr[idx] : log_plus(ptr[idx - 4] , ptr[idx]);
hd[idx] = hd[idx - 4] | hd[idx]; }
if (lane >= 8) {
ptr[idx] = hd[idx] ? ptr[idx] : log_plus(ptr[idx - 8] , ptr[idx]);
hd[idx] = hd[idx - 8] | hd[idx]; }
if (lane >= 16) {
ptr[idx] = hd[idx] ? ptr[idx] : log_plus(ptr[idx - 16] , ptr[idx]);
hd[idx] = hd[idx - 16] | hd[idx];
}
return ptr[idx];
}
__device__ void segscan_block(int* ptr, bool* hd, int idx)
{
unsigned int warpid = idx >> 5;
unsigned int warp_first = warpid << 5;
unsigned int warp_last = warp_first + 31;
// Step 1a:
// Before overwriting the input head flags, record whether // this warp begins with an "open" segment.
bool warp_is_open = (hd[warp_first] == 0);
__syncthreads ();
// Step 1b:
// Intra-warp segmented scan in each warp.
int val = segscan_warp(ptr, hd, idx);
// Step 2a:
// Since ptr[] contains *inclusive* results, irrespective of Kind, // the last value is the correct partial result.
int warp_total = ptr[warp_last];
// Step 2b:
// warp_flag is the OR-reduction of the flags in a warp and is
// computed indirectly from the mindex values in hd[].
// will_accumulate indicates that a thread will only accumulate a
// partial result in Step 4 if there is no segment boundary to its left.
bool warp_flag = hd[warp_last]!=0 || !warp_is_open;
bool will_accumulate = warp_is_open && hd[idx]==0;
__syncthreads ();
// Step 2c: The last thread in each warp writes partial results
if( idx == warp_last ) {
ptr[warpid] = warp_total;
hd[warpid] = warp_flag;
}
__syncthreads ();
// Step 3: One warp scans the per-warp results
if( warpid == 0 ) segscan_warp(ptr, hd, idx);
__syncthreads ();
// Step 4: Accumulate results from
if( warpid != 0 && will_accumulate)
val = log_plus(ptr[warpid -1], val);
__syncthreads ();
ptr[idx] = val;
__syncthreads ();
}
__global__ void kernel_1(int* array, int size, bool* key)
{
int idx=threadIdx.x;
int stt=blockIdx.x;
segscan_block(array+stt*1838, key+stt*1838, idx);
__syncthreads();
if(threadIdx.x==0&&key[1024+stt*1838]==0)
{
key[1024+stt*1838]=1;
array[1024+stt*1838]+=array[1023+stt*1838];
}
__syncthreads();
if(threadIdx.x+1024<size)
{
segscan_block(array+1024+stt*1838, key+1024+stt*1838, idx);
}
}
int main()
{
int* data;
bool* keys;
int* vals;
int* thrust_keys;
int n=10;
int array_size=1838*n;
int num_of_rule=1838;
printf("%f\n",-std::numeric_limits<float>::max());
cudaMallocManaged(&thrust_keys, array_size*sizeof(int));
cudaMallocManaged(&data, array_size*sizeof(int));
cudaMallocManaged(&keys, array_size*sizeof(bool));
cudaMallocManaged(&vals, array_size*sizeof(int));
for(int i=0;i<num_of_rule;i++)
{
for(int j=0;j<n;j++)
{
data[i+j*num_of_rule] = 1;
thrust_keys[i+j*num_of_rule]=i/10;
if(i%10==0) keys[i+j*num_of_rule]=1;
else keys[i+j*num_of_rule]=0;
}
}
thrust::equal_to<int> binary_pred;
for(int i=0;i<n;i++)
thrust::inclusive_scan_by_key(thrust_keys+i*num_of_rule, thrust_keys +i*num_of_rule+ 1838, data+i*num_of_rule, vals+i*num_of_rule, binary_pred, add_);
dim3 grid(n);
dim3 block(1024);
dim3 block2(1838-1024);
kernel_1<<< grid, block >>>(data, 1838, keys);
cudaDeviceSynchronize();
for(int i=0;i<1838*n;i++) {
if(vals[i]!=data[i]) printf("%d ", i);
if((vals[i]-1)%10!=(i%1838)%10) printf("%d vs %d\n", vals[i], i);
}
printf("\n");
return 0;
}