-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix_int16.c
110 lines (89 loc) · 2.28 KB
/
matrix_int16.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <x86intrin.h>
#include "perf.h"
#define col 1024 * 4
#define row 64
int16_t mat[row][col];
int16_t mutiplier[col];
int16_t result[row];
#define DEBUG { \
for (int i = 0; i < 8; i++) \
printf("%d, ", result[i]); \
printf("...\n\n"); \
}
void init(void) {
srand(time(0));
for (int i = 0; i < row; i++) {
for (int j = 0; j < col; j++) {
mat[i][j] = (int16_t)(rand() % 32);
}
}
for (int j = 0; j < col; j++) {
mutiplier[j] = (int16_t)(rand() % 32);
}
}
void calc_non(void) {
for(int j = 0; j < row; j++) {
int16_t sum = 0;
int16_t *vec = mat[j];
for(int i = 0; i < col; i++)
sum += vec[i] * mutiplier[i];
result[j] = sum;
}
}
void calc_avx2(void) {
#define VLEN (256/8/sizeof(int16_t))
const int col_aligned = col - col % VLEN;
int16_t dst_arr[VLEN];
__m256i op0, op1, tmp_vec;
volatile __m256i tgt;
for (int i = 0; i < row; i++) {
int16_t sum = 0;
tgt = _mm256_setzero_si256();
for (int j = 0; j < col_aligned; j += VLEN) {
op0 = _mm256_load_si256((__m256i *)&mutiplier[j]);
op1 = _mm256_load_si256((__m256i *)&mat[i][j]);
tmp_vec = _mm256_mullo_epi16(op0, op1);
tgt = _mm256_add_epi16(tmp_vec, tgt);
}
_mm256_store_epi64(&dst_arr, tgt);
for (int k = 0; k < VLEN; k++)
sum += dst_arr[k];
for (int l = col_aligned; l < col; l++)
sum += mat[i][l] * mutiplier[l];
result[i] = sum;
}
#undef VLEN
}
void calc_avx512(void) {
#define VLEN (512/8/sizeof(int16_t))
const int col_aligned = col - col % VLEN;
int16_t dst_arr[VLEN];
__m512i op0, op1, tmp_vec;
volatile __m512i tgt;
for (int i = 0; i < row; i++) {
int sum = 0;
tgt = _mm512_setzero_si512();
for (int j = 0; j < col_aligned; j += VLEN) {
op0 = _mm512_load_si512(&mutiplier[j]);
op1 = _mm512_load_si512(&mat[i][j]);
tmp_vec = _mm512_mullo_epi16(op0, op1);
tgt = _mm512_add_epi16(tmp_vec, tgt);
}
_mm512_store_epi64(dst_arr, tgt);
for (int k = 0; k < VLEN; k++)
sum += dst_arr[k];
for (int l = col_aligned; l < col; l++)
sum += mat[i][l] * mutiplier[l];
result[i] = sum;
}
#undef VLEN
}
int main() {
init();
TIME(LOOP(calc_non()));
DEBUG
TIME(LOOP(calc_avx2()));
DEBUG
TIME(LOOP(calc_avx512()));
DEBUG
}