-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhammingDistance_amd64.s
141 lines (120 loc) · 4.33 KB
/
hammingDistance_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
//; ============================================================================
//; ===[HammingDistance]========================================================
//; ============================================================================
//; SI = Pointer to a
//; DI = Pointer to b
//; R8 = Bytes left to check
//; AX = From hdHuge and onwards AX is used to count the number of mismatching bytes
//; BX, CX, DX, R9 are used as temporary registers
//; func HammingDistance(a, b []byte) int
TEXT ·HammingDistance(SB),7,$0
MOVQ a+0(FP), SI //; pointer to a
MOVQ a+8(FP), CX //; length of a
MOVQ b+24(FP), DI //; pointer to b
MOVQ b+32(FP), R8 //; length of b
//; return value should be saved in res+48(FP)
//; Return if not equal length
CMPQ CX, R8
JNE error
//; return if length is zero
CMPQ R8 ,$0
JE error
XORQ AX, AX
//; If the length of a and b is less than 192 bytes jump to hdSmallEntry
CMPQ R8, $192
JB hdSmallEntry
//; TODO: Make another version of this HammingDistance function without this
//; check and with a good comment that explicitly warns that we need to check if
//; we have all CPU fetures that we need before calling the new function.
//; Check that we have access to all instructions we need
MOVQ $1, AX //; AX=1: Processor Info and Feature Bits
CPUID //; MOVOU (MOVDQU) needs sse2, bit 26 in EDX
//; PCMPEQB needs sse2, bit 26 in EDX
//; PMOVMSKB needs sse2, bit 26 in EDX
//; sse2 came with the Pentium 4 in 2000
//; POPCNTQ needs popcnt bit 23 ECX
//; The popcnt CPUID feture is the newest feture that we need.
//; popcnt came with the Nehalem architecture in 2008.
ANDQ $0x4000000, DX //; AND against bit 26 in EDX
JZ missingSSE2
ANDQ $0x800000, CX //; AND against bit 23 in ECX
JZ missingPOPCNTQ
//; AX is used to count the number of mismatching bytes
XORQ AX, AX //; set AX to zero
hdHuge:
//; Copy in 64 bytes of a and b in to the xmm registers X0 to X7
MOVOU (SI), X0 //; MOVDQU on Sandy Bridge (m128, xmm) 1 cycles/instructions and 3 in latency
MOVOU 16(SI), X2
MOVOU 32(SI), X4
MOVOU 48(SI), X6
MOVOU (DI), X1
MOVOU 16(DI), X3
MOVOU 32(DI), X5
MOVOU 48(DI), X7
//; PCMPEQB will set X0, X2, X4 and X6 to only contain 0xFF if corresponding
//; bytes in a and b are the same.
PCMPEQB X1, X0 //; Sandy Bridge (xmm xmm) 0.5 cycles/instructions and 1 in latency
PCMPEQB X3, X2
PCMPEQB X5, X4
PCMPEQB X7, X6
//; PMOVMSKB takes the highest bit from every byte in X0, X2, X4 and X6
//; respectivly and copys them in to the 16 least significant bits of
//; R9, BX, CX and DX respectivly.
PMOVMSKB X0, R9 //; Sandy Bridge (reg xmm) 1 cycles/instructions and 2 in latency
PMOVMSKB X2, BX
PMOVMSKB X4, CX
PMOVMSKB X6, DX
//; Move all 64 resulting bits in to R9
SHLQ $48, R9 //; XX______ Sandy Bridge (reg im) 0.5 cycles/instructions and 1 in latency
SHRQ $16, R9:BX //; XXXX____ Sandy Bridge (reg im im) 0.5 cycles/instructions
SHRQ $16, R9:CX //; XXXXXX__
SHRQ $16, R9:DX //; XXXXXXXX
//; Currently all set bits in R9 represents equal bytes in a and b in the
//; 64 bytes currently being processed
//; After we NOT R9 then all set bits in R9 represents bytes that are not equal.
NOTQ R9
//; Clear BX so we can add the POPCNTQ result directly to AX (also break
//; dependency chain as POPCNTQ has a false dependency on dest)
XORQ BX, BX
//; Count the number of set bits and save to AX
POPCNTQ R9, BX
ADDQ BX, AX
//; Update the pointers and counter then jump back to the start of the loop.
ADDQ $64, SI //; (ADD and SUB) Sandy Bridge (reg im) 0.33 cycles/instructions and 1 in latency
ADDQ $64, DI
SUBQ $64, R8
CMPQ R8, $64
JNB hdHuge
//; End of hdHuge loop, there is now 63 bytes or less left to check
JMP hdSmallEntry
//; Check one byte at a time and ADD 1 for every byte that is not equal
hdSmallnoAdd:
ADDQ $1, DI
ADDQ $1, SI
SUBQ $1, R8
hdSmallEntry:
CMPB R8, $0
JE hdRet
hdSmall:
MOVB (SI), CX
CMPB CX, (DI)
JE hdSmallnoAdd
ADDQ $1, AX
ADDQ $1, DI
ADDQ $1, SI
SUBQ $1, R8
CMPB R8, $0
JNE hdSmall
hdRet:
MOVQ AX, res+48(FP)
RET
missingPOPCNTQ:
//; Not yet implemented
missingSSE2:
//; Not yet implemented
error:
MOVQ $-1, res+48(FP)
RET
//; ============================================================================
//; ===[End of HammingDistance]=================================================
//; ============================================================================