-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle final elements in SpanHelpers.Contains for byte and char vectorized #67492
Handle final elements in SpanHelpers.Contains for byte and char vectorized #67492
Conversation
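Tagging subscribers to this area: @dotnet/area-system-memory
Issue Details
Description
Let's assume we have a `searchSpace` of length `(n + 1) * Vector<T>.Count - k`, where `T` is either `byte` or `char`, and `k in (0, Vector<T>.Count)`.
So the current implementation -- ignoring alignment for a moment -- can perform `n` vectorized operations, then falls back to sequential processing of the remaining `Vector<T>.Count - k` elements.
In numbers for byte, AVX2, n = 2, and k = 1:
So as a ratio there are `(Vector<T>.Count - k) / (n * Vector<T>.Count)` elements that need to be processed sequentially.
The worst case is for `k = 1` and small `n`, i.e. for AVX2 and `k = 1`, 31 elements need to be processed sequentially.
The proposed change avoids the sequential processing of the remaining elements by reading a final vector from the end of the `searchSpace`.
When exiting the standard vectorized loop, we know that the `searchSpace` is at least `Vector<T>.Count` long, so it is safe to read from that end, and the operation is idempotent too.
Thus in total we do `n + 1` vectorized operations.
(Note: the same / similar approach is used in #67049, and some other places where idempotency can be used (I commented quite a few times on this 😉))
Benchmark results
Notes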
For the benchmarks the `searchSpace` is aligned to 32 bytes, to have reproducible results.
machine info
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19043.1586 (21H1/May2021Update)
Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=7.0.100-preview.4.22181.7
[Host] : .NET 7.0.0 (7.0.22.17907), X64 RyuJIT
DefaultJob : .NET 7.0.0 (7.0.22.17907), X64 RyuJIT
bool Contains(ref byte searchSpace, byte value, int length)
bool Contains(ref char searchSpace, char value, int length)
Machine code (x64)
SpanHelpers.Contains(byte)
; SpanHelpersContainsByteBenchmark.Default()
mov rdx,[rcx+8]
movzx eax,byte ptr [rcx+14]
mov r8d,[rcx+10]
mov rcx,rdx
mov edx,eax
jmp qword ptr [7FFB35F51420]
; Total bytes of code 23
; SpanHelpersContainsByteBenchmark.Contains(Byte ByRef, Byte, Int32)
vzeroupper
movzx eax,dl
mov edx,eax
xor r9d,r9d
mov r10d,r8d
mov r11,r10
cmp r8d,40
jl short M01_L00
mov r11,rcx
and r11,1F
neg r11
add r11,20
and r11,1F
M01_L00:
cmp r11,8
jb near ptr M01_L02
M01_L01:
add r11,0FFFFFFFFFFFFFFF8
movzx r8d,byte ptr [rcx+r9]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+1]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+2]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+3]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+4]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+5]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+6]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+7]
cmp r8d,edx
je near ptr M01_L09
add r9,8
cmp r11,8
jae near ptr M01_L01
M01_L02:
cmp r11,4
jb short M01_L03
add r11,0FFFFFFFFFFFFFFFC
movzx r8d,byte ptr [rcx+r9]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+1]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+2]
cmp r8d,edx
je near ptr M01_L09
movzx r8d,byte ptr [rcx+r9+3]
cmp r8d,edx
je near ptr M01_L09
add r9,4
M01_L03:
test r11,r11
je short M01_L05
M01_L04:
dec r11
movzx r8d,byte ptr [rcx+r9]
cmp r8d,edx
je short M01_L09
inc r9
test r11,r11
jne short M01_L04
M01_L05:
cmp r9,r10
jae short M01_L08
mov r11,r10
sub r11,r9
and r11,0FFFFFFFFFFFFFFE0
imul r8d,eax,1010101
vmovd xmm0,r8d
vpbroadcastd ymm0,xmm0
cmp r11,r9
jbe short M01_L07
nop dword ptr [rax]
nop dword ptr [rax+rax]
M01_L06:
vpcmpeqb ymm1,ymm0,[rcx+r9]
vxorps ymm2,ymm2,ymm2
vpxor ymm1,ymm2,ymm1
vptest ymm1,ymm1
jne short M01_L09
add r9,20
cmp r11,r9
ja short M01_L06
M01_L07:
cmp r9,r10
jae short M01_L08
mov r11,r10
sub r11,r9
jmp near ptr M01_L00
M01_L08:
xor eax,eax
vzeroupper
ret
M01_L09:
mov eax,1
vzeroupper
ret
; Total bytes of code 397
; SpanHelpersContainsByteBenchmark.PR()
mov rdx,[rcx+8]
movzx eax,byte ptr [rcx+14]
mov r8d,[rcx+10]
mov rcx,rdx
mov edx,eax
jmp qword ptr [7FFB35F61438]
; Total bytes of code 23
; SpanHelpersContainsByteBenchmark.Contains_PR(Byte ByRef, Byte, Int32)
push rdi
push rsi
vzeroupper
movzx eax,dl
mov edx,eax
xor r9d,r9d
mov r10d,r8d
mov r11,r10
cmp r8d,40
jl short M01_L00
mov r11,rcx
and r11,1F
neg r11
add r11,20
and r11,1F
M01_L00:
cmp r11,8
jb short M01_L02
nop dword ptr [rax]
M01_L01:
add r11,0FFFFFFFFFFFFFFF8
lea rsi,[rcx+r9]
movzx edi,byte ptr [rsi]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+1]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+2]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+3]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+4]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+5]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+6]
cmp edx,edi
je near ptr M01_L09
movzx esi,byte ptr [rsi+7]
cmp edx,esi
je near ptr M01_L09
add r9,8
cmp r11,8
jae short M01_L01
M01_L02:
cmp r11,4
jb short M01_L03
add r11,0FFFFFFFFFFFFFFFC
lea rsi,[rcx+r9]
movzx edi,byte ptr [rsi]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+1]
cmp edx,edi
je near ptr M01_L09
movzx edi,byte ptr [rsi+2]
cmp edx,edi
je near ptr M01_L09
movzx esi,byte ptr [rsi+3]
cmp edx,esi
je near ptr M01_L09
add r9,4
M01_L03:
test r11,r11
je short M01_L05
nop dword ptr [rax+rax]
M01_L04:
dec r11
movzx esi,byte ptr [rcx+r9]
cmp esi,edx
je short M01_L09
inc r9
test r11,r11
jne short M01_L04
M01_L05:
cmp r9,r10
jae short M01_L08
mov r11,r10
sub r11,r9
and r11,0FFFFFFFFFFFFFFE0
vxorps ymm0,ymm0,ymm0
imul eax,1010101
vmovd xmm1,eax
vpbroadcastd ymm1,xmm1
cmp r9,r11
jae short M01_L07
M01_L06:
vpcmpeqb ymm2,ymm1,[rcx+r9]
vpxor ymm2,ymm0,ymm2
vptest ymm2,ymm2
jne short M01_L09
add r9,20
cmp r9,r11
jb short M01_L06
M01_L07:
cmp r9,r10
jae short M01_L08
add r8d,0FFFFFFE0
mov r9d,r8d
vpcmpeqb ymm2,ymm1,[rcx+r9]
vpxor ymm0,ymm0,ymm2
vptest ymm0,ymm0
jne short M01_L09
M01_L08:
xor eax,eax
vzeroupper
pop rsi
pop rdi
ret
M01_L09:
mov eax,1
vzeroupper
pop rsi
pop rdi
ret
; Total bytes of code 389
SpanHelpers.Contains(char)
; SpanHelpersContainsCharBenchmark.Default()
mov rdx,[rcx+8]
movzx eax,word ptr [rcx+14]
mov r8d,[rcx+10]
mov rcx,rdx
mov edx,eax
jmp qword ptr [7FFB35F51420]
; Total bytes of code 23
; SpanHelpersContainsCharBenchmark.Contains(Char ByRef, Char, Int32)
push rax
vzeroupper
xor eax,eax
mov [rsp],rax
mov [rsp],rcx
movsxd rax,r8d
lea r9,[rcx+rax*2]
cmp r8d,20
jl short M01_L00
mov r8d,ecx
and r8d,1F
mov eax,r8d
shr eax,1F
add eax,r8d
sar eax,1
mov r8d,eax
neg r8d
add r8d,10
and r8d,0F
M01_L00:
cmp r8d,4
jl short M01_L02
movzx r10d,dx
M01_L01:
add r8d,0FFFFFFFC
movzx eax,word ptr [rcx]
cmp r10d,eax
je near ptr M01_L08
movzx eax,word ptr [rcx+2]
cmp r10d,eax
je near ptr M01_L08
movzx eax,word ptr [rcx+4]
cmp r10d,eax
je near ptr M01_L08
movzx eax,word ptr [rcx+6]
cmp r10d,eax
je near ptr M01_L08
add rcx,8
cmp r8d,4
jge short M01_L01
M01_L02:
test r8d,r8d
jle short M01_L04
movzx r10d,dx
nop
M01_L03:
dec r8d
movzx eax,word ptr [rcx]
cmp r10d,eax
je near ptr M01_L08
add rcx,2
test r8d,r8d
jg short M01_L03
M01_L04:
cmp rcx,r9
jae short M01_L07
mov r8,r9
sub r8,rcx
mov rax,r8
shr rax,3F
add rax,r8
sar rax,1
mov r8d,eax
and r8d,0FFFFFFF0
movzx r10d,dx
imul eax,r10d,10001
vmovd xmm0,eax
vpbroadcastd ymm0,xmm0
test r8d,r8d
jle short M01_L06
M01_L05:
vpcmpeqw ymm1,ymm0,[rcx]
vxorps ymm2,ymm2,ymm2
vpxor ymm1,ymm2,ymm1
vptest ymm1,ymm1
jne short M01_L08
add rcx,20
add r8d,0FFFFFFF0
test r8d,r8d
jg short M01_L05
M01_L06:
cmp rcx,r9
jae short M01_L07
mov r8,r9
sub r8,rcx
mov rax,r8
shr rax,3F
add rax,r8
sar rax,1
mov r8d,eax
jmp near ptr M01_L00
M01_L07:
xor eax,eax
vzeroupper
add rsp,8
ret
M01_L08:
mov eax,1
vzeroupper
add rsp,8
ret
; Total bytes of code 311
; SpanHelpersContainsCharBenchmark.PR()
mov rdx,[rcx+8]
movzx eax,word ptr [rcx+14]
mov r8d,[rcx+10]
mov rcx,rdx
mov edx,eax
jmp qword ptr [7FFB35F51438]
; Total bytes of code 23
; SpanHelpersContainsCharBenchmark.Contains_PR(Char ByRef, Char, Int32)
push rsi
sub rsp,10
vzeroupper
xor eax,eax
mov [rsp+8],rax
mov [rsp+8],rcx
xor r9d,r9d
mov r10d,r8d
mov r11,r10
cmp r8d,20
jl short M01_L00
mov r11d,ecx
and r11d,1F
shr r11d,1
mov eax,r11d
neg eax
add eax,10
and eax,0F
mov r11d,eax
M01_L00:
cmp r11,4
jb short M01_L02
movzx r8d,dx
M01_L01:
add r11,0FFFFFFFFFFFFFFFC
lea rax,[rcx+r9*2]
movzx esi,word ptr [rax]
cmp r8d,esi
je near ptr M01_L08
movzx esi,word ptr [rax+2]
cmp r8d,esi
je near ptr M01_L08
movzx esi,word ptr [rax+4]
cmp r8d,esi
je near ptr M01_L08
movzx eax,word ptr [rax+6]
cmp r8d,eax
je near ptr M01_L08
add r9,4
cmp r11,4
jae short M01_L01
M01_L02:
test r11,r11
je short M01_L04
movzx r8d,dx
nop dword ptr [rax+rax]
nop dword ptr [rax+rax]
M01_L03:
dec r11
movzx eax,word ptr [rcx+r9*2]
cmp eax,r8d
je near ptr M01_L08
inc r9
test r11,r11
jne short M01_L03
M01_L04:
cmp r9,r10
jae short M01_L07
mov r11,r10
sub r11,r9
and r11,0FFFFFFFFFFFFFFF0
vxorps ymm0,ymm0,ymm0
movzx r8d,dx
imul eax,r8d,10001
vmovd xmm1,eax
vpbroadcastd ymm1,xmm1
cmp r9,r11
jae short M01_L06
nop word ptr [rax+rax]
M01_L05:
vpcmpeqw ymm2,ymm1,[rcx+r9*2]
vpxor ymm2,ymm0,ymm2
vptest ymm2,ymm2
jne short M01_L08
add r9,10
cmp r9,r11
jb short M01_L05
M01_L06:
cmp r9,r10
jae short M01_L07
vpcmpeqw ymm2,ymm1,[rcx+r10*2+0FFE0]
vpxor ymm0,ymm0,ymm2
vptest ymm0,ymm0
jne short M01_L08
M01_L07:
xor eax,eax
vzeroupper
add rsp,10
pop rsi
ret
M01_L08:
mov eax,1
vzeroupper
add rsp,10
pop rsi
ret
; Total bytes of code 314
👉 If this looks good, I'd like to look into IndexOf, etc. too.
|
This comment was marked as outdated.
This comment was marked as outdated.
It seems we're missing benchmarks for this? (If so can we add yours?) |
VS didn't do this for comments (at least in my setup) automatically :-(
Sure 😃 dotnet/performance#2347 |
var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset)); | ||
if (Vector<byte>.Zero.Equals(matches)) | ||
matches = Vector.Equals(values, LoadVector(ref searchSpace, offset)); | ||
if (zero.Equals(matches)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moreover, you don't need to hoist it - it should not be used
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, `vec1 == vec2` emits better code.
For the char-overload:
M01_L05:
vpcmpeqw ymm2,ymm1,[rcx+r9*2]
- vpxor ymm2,ymm0,ymm2
vptest ymm2,ymm2
jne short M01_L08
add r9,10
cmp r9,r11
jb short M01_L05
M01_L06:
cmp r9,r10
jae short M01_L07
vpcmpeqw ymm2,ymm1,[rcx+r10*2+0FFE0]
- vpxor ymm0,ymm0,ymm2
vptest ymm0,ymm0
jne short M01_L08
Thanks for the hint!
Will create an issue for that --> #67500
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thank you @gfoidl !
|
||
if (Vector.IsHardwareAccelerated && length >= Vector<byte>.Count * 2) | ||
{ | ||
lengthToExamine = UnalignedCountVector(ref searchSpace); | ||
} | ||
|
||
SequentialScan: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removal of anything `goto`-related is always welcome 👍
@@ -411,10 +413,17 @@ public static bool Contains(ref byte searchSpace, byte value, int length) | |||
goto Found; | |||
} | |||
|
|||
if (offset < (nuint)(uint)length) | |||
// The total length is at least Vector<byte>.Count, so instead of falling back to a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thank you for adding the comment (otherwise it would not be obvious to me) 👍
Did we expect any perf regressions from this? Seems like it might be related to dotnet/perf-autofiling-issues#4884 |
No regression is expected; rather, it should be an improvement. When I check the benchmark code [inline snippet lost in extraction]. What is the proper way to investigate this regression? |
Description
Let's assume we have a `searchSpace` of length `(n + 1) * Vector<T>.Count - k`, where `T` is either `byte` or `char`, and `k in (0, Vector<T>.Count)`.
So the current implementation -- ignoring alignment for a moment -- can perform `n` vectorized operations, then falls back to sequential processing of the remaining `Vector<T>.Count - k` elements.
In numbers for byte, AVX2, n = 2, and k = 1:
So as a ratio there are `(Vector<T>.Count - k) / (n * Vector<T>.Count)` elements that need to be processed sequentially.
The worst case is for `k = 1` and small `n`, i.e. for AVX2 and `k = 1`, 31 elements need to be processed sequentially.
The proposed change avoids the sequential processing of the remaining elements by reading a final vector from the end of the `searchSpace`.
When exiting the standard vectorized loop, we know that the `searchSpace` is at least `Vector<T>.Count` long, so it is safe to read from that end, and the operation is idempotent too.
Thus in total we do `n + 1` vectorized operations.
(Note: the same / similar approach is used in #67049, and some other places where idempotency can be used (I commented quite a few times on this 😉))
Benchmark results
Notes
The JIT does not hoist `Vector<T>.Zero` outside the loop; this is done manually with this PR, and that's why a speedup is shown for length 64 (byte) and 32 (char).
For the benchmarks the `searchSpace` is aligned to 32 bytes, to have reproducible results.
machine info
bool Contains(ref byte searchSpace, byte value, int length)
bool Contains(ref char searchSpace, char value, int length)
Machine code (x64)
SpanHelpers.Contains(byte)
SpanHelpers.Contains(char)
👉 If this looks good, I'd like to look into IndexOf, etc. too.