-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmatrix3.c
63 lines (54 loc) · 1.52 KB
/
matrix3.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* This program is basically the one in the paper /What Every
Programmer Should Know About Memory/ by Ulrich Drepper, section
6.2. See the paper for more information. */
#include <stdlib.h>
#include <emmintrin.h>
#include "matrix.h"
/* Number of doubles that fit in CLS bytes.  run() uses this both as the
   step of its three outer (tile) loops and as the bound of its three inner
   loops, so it is the edge length of one tile.
   CLS is not defined in this file -- presumably the cache line size in
   bytes, supplied by matrix.h or on the compiler command line (TODO
   confirm).  Note the expression has type size_t, not int.  */
#define SM (CLS / sizeof (double))
/* Set-up hook, called with the program's command-line arguments.

   When CHECK_RESULT is defined at compile time, the two input matrices
   (mul1, mul2) are passed to init_matrix() and the output matrix (res) is
   passed to reset_matrix(); those helpers and the matrices are declared
   outside this file.  Without CHECK_RESULT the function does nothing.

   argc, argv: ignored.
   Returns: always NULL.  */
void *
prepare (int argc, char *argv[])
{
  void *state = NULL;

  (void) argc;
  (void) argv;

#if defined (CHECK_RESULT)
  /* Keep this call order: inputs first, then the output matrix.  */
  init_matrix (mul1);
  init_matrix (mul2);
  reset_matrix (res);
#endif

  return state;
}
int
run (int argc, char *argv[], void *unused)
{
int i, i2, j, j2, k, k2;
double *restrict rres;
double *restrict rmul1;
double *restrict rmul2;
for (i = 0; i < N; i += SM)
for (j = 0; j < N; j += SM)
for (k = 0; k < N; k += SM)
for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM;
++i2, rres += N, rmul1 += N)
{
_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM; ++k2, rmul2 += N)
{
__m128d m1d = _mm_load_sd (&rmul1[k2]);
m1d = _mm_unpacklo_pd (m1d, m1d);
for (j2 = 0; j2 < SM; j2 += 2)
{
__m128d m2 = _mm_load_pd (&rmul2[j2]);
__m128d r2 = _mm_load_pd (&rres[j2]);
_mm_store_pd (&rres[j2],
_mm_add_pd (_mm_mul_pd (m2, m1d), r2));
}
}
}
return 0;
}
/* Post-run hook.

   When CHECK_RESULT is defined at compile time, prints the value returned
   by sum_matrix (res) to stdout with no decimals, followed by a newline.
   Without CHECK_RESULT nothing is printed.

   NOTE(review): printf needs <stdio.h>, which this file does not include
   directly -- presumably it arrives via matrix.h; verify.

   argc, argv, unused: ignored.
   Returns: always 0.  */
int
postrun (int argc, char *argv[], void *unused)
{
  (void) argc;
  (void) argv;
  (void) unused;

#if defined (CHECK_RESULT)
  printf ("%.0f\n", sum_matrix (res));
#endif

  return 0;
}