-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaes.cu
164 lines (126 loc) · 4.46 KB
/
aes.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#include "aes_table.h"
// CUDA AES implementation courtesy of Andrew Pratt
// https://www.andrew.cmu.edu/user/aspratt/accelerated_aes/
// No license specified, so uh, hope he doesn't mind?
//XOR round key with block(1 block per thread)
__device__ void add_round_key(uint8_t *block, const uint8_t *key, uint32_t offset){
//word size traversal
uint32_t *b = (uint32_t *)block;
uint32_t *k = (uint32_t *)key;
for(int i = 0; i < 4; ++i){
b[offset/4 + i] = b[offset/4 + i] ^ k[i];
}
}
//substitute block int sbox (1 block per thread)
__device__ void sub_bytes(uint8_t *block, uint32_t offset){
for(int i = 0; i < 16; ++i){
block[offset + i] = sbox[block[offset + i]];
}
}
//mix columns by taking linear combinations in the field (1 block per thread)
__device__ void mix_columns(uint8_t *block, uint32_t offset){
for(int i = 0; i < 4; ++i){ //iterate over columns
uint8_t a[4];
uint8_t b[4];
uint8_t h;
for(int j = 0; j < 4; ++j){
a[j] = block[offset + 4*i + j];
h = (uint8_t)((int8_t)a[j] >> 7);
b[j] = a[j] << 1;
b[j] ^= 0x1b & h;
}
block[offset + 4*i + 0] = b[0] ^ a[3] ^ a[2] ^ b[1] ^ a[1];
block[offset + 4*i + 1] = b[1] ^ a[0] ^ a[3] ^ b[2] ^ a[2];
block[offset + 4*i + 2] = b[2] ^ a[1] ^ a[0] ^ b[3] ^ a[3];
block[offset + 4*i + 3] = b[3] ^ a[2] ^ a[1] ^ b[0] ^ a[0];
}
}
//shift rows left by 0,1,2,3 bytes respectively (1 block per thread)
__device__ void shift_rows(uint8_t *sblock, uint32_t offset){
uint8_t tmp;
uint8_t *block = sblock + offset;
//row 0 remains unshifted
//shift row 1 left by 1
tmp = block[1];
block[1] = block[5];
block[5] = block[9];
block[9] = block[13];
block[13] = tmp;
//shift row 2 letf by 2
tmp = block[2];
block[2] = block[10];
block[10] = tmp;
tmp = block[6];
block[6] = block[14];
block[14] = tmp;
//shift row 3 left by 3
tmp = block[3];
block[3] = block[15];
block[15] = block[11];
block[11] = block[7];
block[7] = tmp;
}
//aes 128 encryption with expanded key supplied
//implemented using 1 t-tables (with rotation) and sbox
//1 block per thread
__device__ void encrypt_one_table(uint8_t *block, const uint8_t *rkey, uint32_t offset){
uint8_t *b = (block + offset);
uint32_t *bword = (uint32_t *)(block + offset);
add_round_key(block, rkey, offset);
for(int i = 1; i < 3; ++i){
uint32_t *ckey = (uint32_t *)(rkey + 16*i);
uint32_t c1 = te0[b[0]] ^ (te0[b[5]]<<8 | te0[b[5]]>>24) ^ (te0[b[10]]<<16 | te0[b[10]]>>16) ^ (te0[b[15]]<<24 | te0[b[15]]>>8) ^ ckey[0];
uint32_t c2 = te0[b[4]] ^ (te0[b[9]]<<8 | te0[b[9]]>>24) ^ (te0[b[14]]<<16 | te0[b[14]]>>16) ^ (te0[b[3]]<<24 | te0[b[3]]>>8) ^ ckey[1];
uint32_t c3 = te0[b[8]] ^ (te0[b[13]]<<8 | te0[b[13]]>>24) ^ (te0[b[2]]<<16 | te0[b[2]]>>16) ^ (te0[b[7]]<<24 | te0[b[7]]>>8) ^ ckey[2];
uint32_t c4 = te0[b[12]] ^ (te0[b[1]]<<8 | te0[b[1]]>>24) ^ (te0[b[6]]<<16 | te0[b[6]]>>16) ^ (te0[b[11]]<<24 | te0[b[11]]>>8) ^ ckey[3];
bword[0] = c1;
bword[1] = c2;
bword[2] = c3;
bword[3] = c4;
}
sub_bytes(block, offset);
shift_rows(block, offset);
add_round_key(block, rkey + 160, offset);
}
//basic encryption kernel. Unused for ctr mode encryption
__device__ void encrypt_k(uint8_t *data, const uint8_t *rkey, uint32_t numblock){
int bindex = blockIdx.x * blockDim.x + threadIdx.x;
int offset = bindex * 16;
if(bindex >= numblock) return;
encrypt_one_table(data, rkey, offset);
}
//generate round keys from initial key
__host__ void expand_key(uint8_t *key, uint8_t *rkey){
uint32_t i,j,k;
uint8_t tempa[4];
uint32_t nround = 10;
//first round key is just the key
for(i = 0; i < 4; ++i){
rkey[4*i + 0] = key[4*i + 0];
rkey[4*i + 1] = key[4*i + 1];
rkey[4*i + 2] = key[4*i + 2];
rkey[4*i + 3] = key[4*i + 3];
}
for(i = 4; i < 4*(nround + 1); ++i){
for(j = 0; j < 4; ++j){
tempa[j] = rkey[(i-1)*4 + j];
}
if(i % 4 == 0){
//rotate 4 bytes in word
k = tempa[0];
tempa[0] = tempa[1];
tempa[1] = tempa[2];
tempa[2] = tempa[3];
tempa[3] = k;
tempa[0] = sbox_host[tempa[0]];
tempa[1] = sbox_host[tempa[1]];
tempa[2] = sbox_host[tempa[2]];
tempa[3] = sbox_host[tempa[3]];
tempa[0] = tempa[0] ^ rcon[i/4];
}
rkey[4*i + 0] = rkey[4*(i-4) + 0] ^ tempa[0];
rkey[4*i + 1] = rkey[4*(i-4) + 1] ^ tempa[1];
rkey[4*i + 2] = rkey[4*(i-4) + 2] ^ tempa[2];
rkey[4*i + 3] = rkey[4*(i-4) + 3] ^ tempa[3];
}
}