-
Notifications
You must be signed in to change notification settings - Fork 19
/
ddot_amd64.s
136 lines (110 loc) · 2.44 KB
/
ddot_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// func Ddot(N int, X []float64, incX int, Y []float64, incY int) float64
//
// Ddot returns the sum of X[i*incX] * Y[i*incY] for i = 0 .. N-1.
//
// Behaviour:
//   - N <= 0 returns 0 without touching X or Y.
//   - If incX*(N-1) or incY*(N-1) overflows, is negative, or is not a valid
//     index into X / Y respectively, ·panicIndex is called (it is declared
//     elsewhere in the package).
//
// Register roles:
//   R8      number of pairs still to process (kept out of BP on purpose:
//           this function has a zero-size frame, so the assembler does not
//           save/restore BP and the caller's frame pointer must survive)
//   SI, DI  current read positions in X and Y
//   AX, BX  incX and incY; converted to byte strides after the bounds check
//   CX, DX  scratch: last accessed index during the bounds check, later the
//           doubled byte strides used by the strided SIMD loop
//   X0, X1  packed accumulators; X0 low half holds the final scalar sum
//   X2-X5   temporaries for loaded values and products
TEXT ·Ddot(SB), 7, $0
	MOVQ	N+0(FP), R8
	MOVQ	X_data+8(FP), SI
	MOVQ	incX+32(FP), AX
	MOVQ	Y_data+40(FP), DI
	MOVQ	incY+64(FP), BX

	// Clear accumulators (done first so the early exit below returns 0)
	XORPD	X0, X0
	XORPD	X1, X1

	// Nothing to sum for N <= 0. Without this guard a negative N would pass
	// the bounds check and make the scalar loop run until the counter wraps.
	TESTQ	R8, R8
	JLE	end

	// Check data boundaries: the last accessed index must be inside the slice
	MOVQ	R8, CX
	DECQ	CX
	MOVQ	CX, DX
	IMULQ	AX, CX	// CX = incX * (N - 1)
	JO	panic	// product does not fit in 64 bits
	IMULQ	BX, DX	// DX = incY * (N - 1)
	JO	panic	// product does not fit in 64 bits
	// Unsigned comparisons: a negative index looks like a huge unsigned
	// value and is rejected together with indexes >= len.
	CMPQ	CX, X_len+16(FP)
	JAE	panic
	CMPQ	DX, Y_len+48(FP)
	JAE	panic

	// Setup strides
	SALQ	$3, AX	// AX = sizeof(float64) * incX
	SALQ	$3, BX	// BX = sizeof(float64) * incY

	// Check that there are 4 or more pairs for SIMD calculations
	SUBQ	$4, R8
	JL	rest	// There are less than 4 pairs to process

	// Check if incX != 1 or incY != 1
	CMPQ	AX, $8
	JNE	with_stride
	CMPQ	BX, $8
	JNE	with_stride

	// Fully optimized loop (for incX == incY == 1)
full_simd_loop:
	// Multiply first two pairs
	MOVUPD	(SI), X2
	MOVUPD	(DI), X3
	MULPD	X2, X3

	// Multiply second two pairs
	MOVUPD	16(SI), X4
	MOVUPD	16(DI), X5
	MULPD	X4, X5

	// Update data pointers
	ADDQ	$32, SI
	ADDQ	$32, DI

	// Accumulate the results of multiplications
	ADDPD	X3, X0
	ADDPD	X5, X1

	SUBQ	$4, R8
	JGE	full_simd_loop	// There are 4 or more pairs to process

	JMP	hsum

with_stride:
	// Setup long strides (distance covered by two elements)
	MOVQ	AX, CX
	MOVQ	BX, DX
	SALQ	$1, CX	// CX = 16 * incX
	SALQ	$1, DX	// DX = 16 * incY

	// Partially optimized loop: values are gathered one by one into the
	// low and high halves of the XMM registers, then multiplied packed.
half_simd_loop:
	// Multiply first two pairs
	MOVLPD	(SI), X2
	MOVHPD	(SI)(AX*1), X2
	MOVLPD	(DI), X3
	MOVHPD	(DI)(BX*1), X3
	MULPD	X2, X3

	// Update data pointers using long strides
	ADDQ	CX, SI
	ADDQ	DX, DI

	// Multiply second two pairs
	MOVLPD	(SI), X4
	MOVHPD	(SI)(AX*1), X4
	MOVLPD	(DI), X5
	MOVHPD	(DI)(BX*1), X5
	MULPD	X4, X5

	// Update data pointers using long strides
	ADDQ	CX, SI
	ADDQ	DX, DI

	// Accumulate the results of multiplications
	ADDPD	X3, X0
	ADDPD	X5, X1

	SUBQ	$4, R8
	JGE	half_simd_loop	// There are 4 or more pairs to process

hsum:
	// Sum intermediate results from SIMD operations
	ADDPD	X0, X1

	// Horizontal sum: X0 low half = X1 high half + X1 low half
	MOVHLPS	X1, X0
	ADDSD	X1, X0

rest:
	// Undo last SUBQ, R8 = number of pairs left (0..3)
	ADDQ	$4, R8
	// Check whether there are any values left to process
	JE	end

loop:
	// Multiply one pair
	MOVSD	(SI), X2
	MULSD	(DI), X2

	// Update data pointers
	ADDQ	AX, SI
	ADDQ	BX, DI

	// Accumulate the result of multiplication
	ADDSD	X2, X0

	DECQ	R8
	JNE	loop

end:
	// Return the sum
	MOVSD	X0, r+72(FP)
	RET

panic:
	CALL	·panicIndex(SB)
	RET