Improve Span.Reverse fast path performance #70944
Conversation
Tagging subscribers to this area: @dotnet/area-system-memory

Issue Details

Span<T>.Reverse was vectorized earlier this year. This pull request improves the performance for smaller inputs and adds a bswap variant for byte spans.

Notable changes:
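All four assembly diffs below implement the same two-pointer idea: load one vector from each end of the buffer, reverse the elements within each vector, and store the pair back swapped, walking toward the middle. A minimal sketch of that pattern for the Int64 case (illustrative code only, not the exact SpanHelpers source; the cross-platform Vector256.Shuffle stands in for the vpermq ymm, ymm, 1B you will see in the diffs):

using System.Runtime.Intrinsics;

static void ReverseVectorizedSketch(ref long buf, nuint length)
{
    // Each iteration consumes one vector from the front and one from the back.
    nuint numElements = (nuint)Vector256<long>.Count;
    nuint numIters = (length / numElements) / 2;
    for (nuint i = 0; i < numIters; i++)
    {
        nuint firstOffset = i * numElements;
        nuint lastOffset = length - (i + 1) * numElements;

        // Load a vector from each end and reverse the lanes within each.
        Vector256<long> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
        Vector256<long> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);
        tempFirst = Vector256.Shuffle(tempFirst, Vector256.Create(3L, 2L, 1L, 0L));
        tempLast = Vector256.Shuffle(tempLast, Vector256.Create(3L, 2L, 1L, 0L));

        // Store the reversed vectors back swapped end-for-end.
        tempLast.StoreUnsafe(ref buf, firstOffset);
        tempFirst.StoreUnsafe(ref buf, lastOffset);
    }
    // The unprocessed middle falls through to narrower vector/scalar loops.
}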
Benchmarks:

Span.Reverse<long>

Assembly diff for Reverse long

Before:

; System.SpanHelpers.Reverse(Int64 ByRef, UIntPtr)
push rsi
vzeroupper
cmp rdx,8
jb short M01_L02
nop dword ptr [rax]
mov rax,rdx
shr rax,2
shr rax,1
xor r8d,r8d
test rax,rax
jbe short M01_L01
M01_L00:
mov r9,r8
shl r9,2
inc r8
mov r10,r8
shl r10,2
mov r11,rdx
sub r11,r10
vmovdqu ymm0,ymmword ptr [rcx+r9*8]
vmovdqu ymm1,ymmword ptr [rcx+r11*8]
vpermq ymm0,ymm0,1B
vpermq ymm1,ymm1,1B
vmovdqu ymmword ptr [rcx+r9*8],ymm1
vmovdqu ymmword ptr [rcx+r11*8],ymm0
cmp r8,rax
jb short M01_L00
M01_L01:
shl rax,2
lea rcx,[rcx+rax*8]
add rax,rax
sub rdx,rax
jmp short M01_L05
nop dword ptr [rax]
nop
M01_L02:
cmp rdx,4
jb short M01_L05
lea rax,[rdx+rdx]
mov r8,rax
shr r8,2
shr r8,1
xor r9d,r9d
test r8,r8
jbe short M01_L04
M01_L03:
mov r10,r9
shl r10,2
inc r9
mov r11,r9
shl r11,2
mov rsi,rax
sub rsi,r11
vmovdqu xmm0,xmmword ptr [rcx+r10*4]
vmovdqu xmm1,xmmword ptr [rcx+rsi*4]
vpshufd xmm0,xmm0,4E
vpshufd xmm1,xmm1,4E
vmovdqu xmmword ptr [rcx+r10*4],xmm1
vmovdqu xmmword ptr [rcx+rsi*4],xmm0
cmp r9,r8
jb short M01_L03
M01_L04:
mov rax,r8
shl rax,2
lea rcx,[rcx+rax*4]
add r8,r8
add r8,r8
sub rdx,r8
M01_L05:
xor eax,eax
mov r8,rdx
shr r8,1
je short M01_L07
dec rdx
M01_L06:
lea r9,[rcx+rax*8]
mov r10,rdx
sub r10,rax
lea r10,[rcx+r10*8]
mov r11,[r9]
mov rsi,[r10]
mov [r10],r11
mov [r9],rsi
inc rax
cmp r8,rax
ja short M01_L06
M01_L07:
vzeroupper
pop rsi
ret
; Total bytes of code 276

After:

; System.SpanHelpers.Reverse(Int64 ByRef, UIntPtr)
vzeroupper
cmp rdx,8
jb short M01_L01
movsxd rax,edx
lea rax,[rcx+rax*8+0FFE0]
M01_L00:
vmovdqu ymm0,ymmword ptr [rcx]
vmovdqu ymm1,ymmword ptr [rax]
vpermq ymm0,ymm0,1B
vpermq ymm1,ymm1,1B
vmovdqu ymmword ptr [rcx],ymm1
vmovdqu ymmword ptr [rax],ymm0
add rcx,20
add rax,0FFFFFFFFFFFFFFE0
add rdx,0FFFFFFFFFFFFFFF8
cmp rdx,8
jae short M01_L00
jmp short M01_L03
M01_L01:
cmp rdx,4
jb short M01_L03
movsxd rax,edx
lea rax,[rcx+rax*8+0FFF0]
M01_L02:
vmovdqu xmm0,xmmword ptr [rcx]
vmovdqu xmm1,xmmword ptr [rax]
vpshufd xmm0,xmm0,4E
vpshufd xmm1,xmm1,4E
vmovdqu xmmword ptr [rcx],xmm1
vmovdqu xmmword ptr [rax],xmm0
add rcx,10
add rax,0FFFFFFFFFFFFFFF0
add rdx,0FFFFFFFFFFFFFFFC
cmp rdx,4
jae short M01_L02
M01_L03:
movsxd rax,edx
lea rax,[rcx+rax*8+0FFF8]
cmp rcx,rax
jae short M01_L05
nop dword ptr [rax]
nop dword ptr [rax]
M01_L04:
mov rdx,[rcx]
mov r8,[rax]
mov [rax],rdx
mov [rcx],r8
add rcx,8
add rax,0FFFFFFFFFFFFFFF8
cmp rcx,rax
jb short M01_L04
M01_L05:
vzeroupper
ret
; Total bytes of code 189
Span.Reverse<int>

Assembly diff for Reverse int

Before:

; System.SpanHelpers.Reverse(Int32 ByRef, UIntPtr)
push rsi
vzeroupper
cmp rdx,10
jb short M01_L02
mov rax,rdx
shr rax,3
shr rax,1
vmovupd ymm0,[7FFC8796E760]
xor r8d,r8d
test rax,rax
jbe short M01_L01
M01_L00:
mov r9,r8
shl r9,3
inc r8
mov r10,r8
shl r10,3
mov r11,rdx
sub r11,r10
vmovdqu ymm1,ymmword ptr [rcx+r9*4]
vmovdqu ymm2,ymmword ptr [rcx+r11*4]
vpermd ymm1,ymm0,ymm1
vpermd ymm2,ymm0,ymm2
vmovdqu ymmword ptr [rcx+r9*4],ymm2
vmovdqu ymmword ptr [rcx+r11*4],ymm1
cmp r8,rax
jb short M01_L00
M01_L01:
shl rax,3
lea rcx,[rcx+rax*4]
add rax,rax
sub rdx,rax
jmp short M01_L05
M01_L02:
cmp rdx,8
jb short M01_L05
mov rax,rdx
shr rax,2
shr rax,1
xor r8d,r8d
test rax,rax
jbe short M01_L04
M01_L03:
mov r9,r8
shl r9,2
inc r8
mov r10,r8
shl r10,2
mov r11,rdx
sub r11,r10
vmovdqu xmm0,xmmword ptr [rcx+r9*4]
vmovdqu xmm1,xmmword ptr [rcx+r11*4]
vpshufd xmm0,xmm0,1B
vpshufd xmm1,xmm1,1B
vmovdqu xmmword ptr [rcx+r9*4],xmm1
vmovdqu xmmword ptr [rcx+r11*4],xmm0
cmp r8,rax
jb short M01_L03
M01_L04:
shl rax,2
lea rcx,[rcx+rax*4]
add rax,rax
sub rdx,rax
M01_L05:
xor eax,eax
mov r8,rdx
shr r8,1
je short M01_L07
dec rdx
M01_L06:
lea r9,[rcx+rax*4]
mov r10,rdx
sub r10,rax
lea r10,[rcx+r10*4]
mov r11d,[r9]
mov esi,[r10]
mov [r10],r11d
mov [r9],esi
inc rax
cmp r8,rax
ja short M01_L06
M01_L07:
vzeroupper
pop rsi
ret
; Total bytes of code 266

After:

; System.SpanHelpers.Reverse(Int32 ByRef, UIntPtr)
vzeroupper
cmp rdx,10
jb short M01_L01
vmovupd ymm0,[7FFC8E2FD7E0]
movsxd rax,edx
lea rax,[rcx+rax*4+0FFE0]
nop word ptr [rax+rax]
M01_L00:
vmovdqu ymm1,ymmword ptr [rcx]
vmovdqu ymm2,ymmword ptr [rax]
vpermd ymm1,ymm0,ymm1
vpermd ymm2,ymm0,ymm2
vmovdqu ymmword ptr [rcx],ymm2
vmovdqu ymmword ptr [rax],ymm1
add rcx,20
add rax,0FFFFFFFFFFFFFFE0
add rdx,0FFFFFFFFFFFFFFF0
cmp rdx,10
jae short M01_L00
jmp short M01_L03
M01_L01:
cmp rdx,8
jb short M01_L03
movsxd rax,edx
lea rax,[rcx+rax*4+0FFF0]
M01_L02:
vmovdqu xmm0,xmmword ptr [rcx]
vmovdqu xmm1,xmmword ptr [rax]
vpshufd xmm0,xmm0,1B
vpshufd xmm1,xmm1,1B
vmovdqu xmmword ptr [rcx],xmm1
vmovdqu xmmword ptr [rax],xmm0
add rcx,10
add rax,0FFFFFFFFFFFFFFF0
add rdx,0FFFFFFFFFFFFFFF8
cmp rdx,8
jae short M01_L02
M01_L03:
movsxd rax,edx
lea rax,[rcx+rax*4+0FFFC]
cmp rcx,rax
jae short M01_L05
nop
M01_L04:
mov edx,[rcx]
mov r8d,[rax]
mov [rax],edx
mov [rcx],r8d
add rcx,4
add rax,0FFFFFFFFFFFFFFFC
cmp rcx,rax
jb short M01_L04
M01_L05:
vzeroupper
ret
; Total bytes of code 187
Span.Reverse<char>

Assembly diff for Reverse char

Before:

; System.SpanHelpers.Reverse(Char ByRef, UIntPtr)
push rsi
vzeroupper
lea rax,[rdx+rdx]
cmp rdx,20
jb near ptr M01_L02
vmovupd ymm0,[7FFC8A01D780]
mov r8,rax
shr r8,5
shr r8,1
xor r9d,r9d
test r8,r8
jbe short M01_L01
M01_L00:
mov r10,r9
shl r10,5
inc r9
mov r11,r9
shl r11,5
mov rsi,rax
sub rsi,r11
vmovdqu ymm1,ymmword ptr [rcx+r10]
vmovdqu ymm2,ymmword ptr [rcx+rsi]
vpshufb ymm1,ymm1,ymm0
vperm2i128 ymm1,ymm1,ymm1,1
vpshufb ymm2,ymm2,ymm0
vperm2i128 ymm2,ymm2,ymm2,1
vmovdqu ymmword ptr [rcx+r10],ymm2
vmovdqu ymmword ptr [rcx+rsi],ymm1
cmp r9,r8
jb short M01_L00
M01_L01:
mov rax,r8
shl rax,5
add rcx,rax
shl r8,4
add r8,r8
sub rdx,r8
jmp short M01_L05
M01_L02:
cmp rdx,10
jb short M01_L05
vmovupd xmm0,[7FFC8A01D780]
lea r8,[rdx+rdx]
shr r8,4
shr r8,1
xor r9d,r9d
test r8,r8
jbe short M01_L04
M01_L03:
mov r10,r9
shl r10,4
inc r9
mov r11,r9
shl r11,4
mov rsi,rax
sub rsi,r11
vmovdqu xmm1,xmmword ptr [rcx+r10]
vmovdqu xmm2,xmmword ptr [rcx+rsi]
vpshufb xmm1,xmm1,xmm0
vpshufb xmm2,xmm2,xmm0
vmovdqu xmmword ptr [rcx+r10],xmm2
vmovdqu xmmword ptr [rcx+rsi],xmm1
cmp r9,r8
jb short M01_L03
M01_L04:
mov rax,r8
shl rax,4
add rax,rcx
mov rcx,rax
shl r8,3
add r8,r8
sub rdx,r8
M01_L05:
xor eax,eax
mov r8,rdx
shr r8,1
je short M01_L07
dec rdx
M01_L06:
lea r9,[rcx+rax*2]
mov r10,rdx
sub r10,rax
lea r10,[rcx+r10*2]
movzx r11d,word ptr [r9]
movzx esi,word ptr [r10]
mov [r10],r11w
mov [r9],si
inc rax
cmp r8,rax
ja short M01_L06
M01_L07:
vzeroupper
pop rsi
ret
; Total bytes of code 310

After:

; System.SpanHelpers.Reverse(Char ByRef, UIntPtr)
vzeroupper
cmp rdx,20
jb short M01_L01
movsxd rax,edx
lea rax,[rcx+rax*2+0FFE0]
vmovupd ymm0,[7FFC89FFD880]
nop word ptr [rax+rax]
M01_L00:
vmovdqu ymm1,ymmword ptr [rcx]
vmovdqu ymm2,ymmword ptr [rax]
vpshufb ymm1,ymm1,ymm0
vperm2i128 ymm1,ymm1,ymm1,1
vpshufb ymm2,ymm2,ymm0
vperm2i128 ymm2,ymm2,ymm2,1
vmovdqu ymmword ptr [rcx],ymm2
vmovdqu ymmword ptr [rax],ymm1
add rcx,20
add rax,0FFFFFFFFFFFFFFE0
add rdx,0FFFFFFFFFFFFFFE0
cmp rdx,20
jae short M01_L00
M01_L01:
cmp rdx,10
jb short M01_L03
movsxd rax,edx
lea rax,[rcx+rax*2+0FFF0]
vmovupd xmm0,[7FFC89FFD880]
M01_L02:
vmovdqu xmm1,xmmword ptr [rcx]
vmovdqu xmm2,xmmword ptr [rax]
vpshufb xmm1,xmm1,xmm0
vpshufb xmm2,xmm2,xmm0
vmovdqu xmmword ptr [rcx],xmm2
vmovdqu xmmword ptr [rax],xmm1
add rcx,10
add rax,0FFFFFFFFFFFFFFF0
add rdx,0FFFFFFFFFFFFFFF0
cmp rdx,10
jae short M01_L02
M01_L03:
movsxd rax,edx
lea rax,[rcx+rax*2+0FFFE]
cmp rcx,rax
jae short M01_L05
M01_L04:
movzx edx,word ptr [rcx]
movzx r8d,word ptr [rax]
mov [rax],dx
mov [rcx],r8w
add rcx,2
add rax,0FFFFFFFFFFFFFFFE
cmp rcx,rax
jb short M01_L04
M01_L05:
vzeroupper
ret
; Total bytes of code 198
Span.Reverse<byte>

Assembly diff for Reverse byte

Before:

; System.SpanHelpers.Reverse(Byte ByRef, UIntPtr)
push rsi
vzeroupper
cmp rdx,40
jb short M01_L02
vmovupd ymm0,[7FFC840ECD00]
mov rax,rdx
shr rax,5
shr rax,1
xor r8d,r8d
test rax,rax
jbe short M01_L01
M01_L00:
mov r9,r8
shl r9,5
inc r8
mov r10,r8
shl r10,5
mov r11,rdx
sub r11,r10
vmovdqu ymm1,ymmword ptr [rcx+r9]
vmovdqu ymm2,ymmword ptr [rcx+r11]
vpshufb ymm1,ymm1,ymm0
vperm2i128 ymm1,ymm1,ymm1,1
vpshufb ymm2,ymm2,ymm0
vperm2i128 ymm2,ymm2,ymm2,1
vmovdqu ymmword ptr [rcx+r9],ymm2
vmovdqu ymmword ptr [rcx+r11],ymm1
cmp r8,rax
jb short M01_L00
M01_L01:
shl rax,5
add rcx,rax
add rax,rax
sub rdx,rax
jmp short M01_L05
M01_L02:
cmp rdx,20
jb short M01_L05
vmovupd xmm0,[7FFC840ECD00]
mov rax,rdx
shr rax,4
shr rax,1
xor r8d,r8d
test rax,rax
jbe short M01_L04
M01_L03:
mov r9,r8
shl r9,4
inc r8
mov r10,r8
shl r10,4
mov r11,rdx
sub r11,r10
vmovdqu xmm1,xmmword ptr [rcx+r9]
vmovdqu xmm2,xmmword ptr [rcx+r11]
vpshufb xmm1,xmm1,xmm0
vpshufb xmm2,xmm2,xmm0
vmovdqu xmmword ptr [rcx+r9],xmm2
vmovdqu xmmword ptr [rcx+r11],xmm1
cmp r8,rax
jb short M01_L03
M01_L04:
shl rax,4
add rcx,rax
add rax,rax
sub rdx,rax
M01_L05:
xor eax,eax
mov r8,rdx
shr r8,1
je short M01_L07
dec rdx
M01_L06:
lea r9,[rcx+rax]
mov r10,rdx
sub r10,rax
add r10,rcx
movzx r11d,byte ptr [r9]
movzx esi,byte ptr [r10]
mov [r10],r11b
mov [r9],sil
inc rax
cmp r8,rax
ja short M01_L06
M01_L07:
vzeroupper
pop rsi
ret
; Total bytes of code 285

After:

; System.SpanHelpers.Reverse(Byte ByRef, UIntPtr)
vzeroupper
cmp rdx,40
jb short M01_L01
vmovupd ymm0,[7FFC840BCE20]
movsxd rax,edx
lea rax,[rcx+rax+0FFE0]
nop word ptr [rax+rax]
M01_L00:
vmovdqu ymm1,ymmword ptr [rcx]
vmovdqu ymm2,ymmword ptr [rax]
vpshufb ymm1,ymm1,ymm0
vperm2i128 ymm1,ymm1,ymm1,1
vpshufb ymm2,ymm2,ymm0
vperm2i128 ymm2,ymm2,ymm2,1
vmovdqu ymmword ptr [rcx],ymm2
vmovdqu ymmword ptr [rax],ymm1
add rcx,20
add rax,0FFFFFFFFFFFFFFE0
add rdx,0FFFFFFFFFFFFFFC0
cmp rdx,40
jae short M01_L00
M01_L01:
cmp rdx,10
jb short M01_L03
movsxd rax,edx
lea rax,[rcx+rax+0FFF8]
M01_L02:
mov r8,[rcx]
mov r9,[rax]
movbe [rcx],r9
movbe [rax],r8
add rcx,8
add rax,0FFFFFFFFFFFFFFF8
add rdx,0FFFFFFFFFFFFFFF0
cmp rdx,10
jae short M01_L02
M01_L03:
cmp rdx,8
jb short M01_L05
movsxd rax,edx
lea rax,[rcx+rax+0FFFC]
M01_L04:
mov r8d,[rcx]
mov r9d,[rax]
movbe [rcx],r9d
movbe [rax],r8d
add rcx,4
add rax,0FFFFFFFFFFFFFFFC
add rdx,0FFFFFFFFFFFFFFF8
cmp rdx,8
jae short M01_L04
M01_L05:
movsxd rax,edx
lea rax,[rcx+rax+0FFFF]
cmp rcx,rax
jae short M01_L07
M01_L06:
movzx edx,byte ptr [rcx]
movzx r8d,byte ptr [rax]
mov [rax],dl
mov [rcx],r8b
inc rcx
dec rax
cmp rcx,rax
jb short M01_L06
M01_L07:
vzeroupper
ret
; Total bytes of code 224
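The byte path is where the new bswap variant shows up: once fewer than 64 bytes remain, rather than dropping straight to the scalar loop, 8 bytes are loaded from each end, byte-reversed, and stored swapped, which the JIT lowers to the movbe instructions visible in M01_L02/M01_L04 above. A rough sketch of the idea (illustrative names and shape, assuming BinaryPrimitives.ReverseEndianness as the byte-swap; not the exact runtime code):

using System;
using System.Buffers.Binary;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

static void ReverseByUlongSketch(Span<byte> buffer)
{
    ref byte buf = ref MemoryMarshal.GetReference(buffer);
    nuint offset = 0;
    nuint remainder = (nuint)buffer.Length;

    while (remainder >= 2 * sizeof(ulong))
    {
        nuint lastOffset = offset + remainder - sizeof(ulong);
        ulong tempFirst = Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref buf, offset));
        ulong tempLast = Unsafe.ReadUnaligned<ulong>(ref Unsafe.Add(ref buf, lastOffset));

        // Byte-reversing each ulong and storing the pair swapped reverses the
        // whole 16-byte window (load/store + bswap fuse into movbe on x64).
        Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, offset), BinaryPrimitives.ReverseEndianness(tempLast));
        Unsafe.WriteUnaligned(ref Unsafe.Add(ref buf, lastOffset), BinaryPrimitives.ReverseEndianness(tempFirst));

        offset += sizeof(ulong);
        remainder -= 2 * sizeof(ulong);
    }
    // A uint-sized pass and a final scalar swap handle the rest (elided).
}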
Benchmark Code

[DisassemblyDiagnoser(printSource: true)]
[GenericTypeArguments(typeof(byte))]
[GenericTypeArguments(typeof(char))]
[GenericTypeArguments(typeof(int))]
[GenericTypeArguments(typeof(long))]
public class ReverseTests<T>
{
[Params(4, 8, 19, 128, 255, 1024)]
public int NumberOfValues { get; set; }
private T[] _values;
[GlobalSetup]
public void Setup()
{
// taken from dotnet/performance
_values = ValuesGenerator.Array<T>(NumberOfValues);
}
[Benchmark]
public void Reverse() => _values.AsSpan().Reverse();
}

I also ran the dotnet/performance ResultsComparer with DOTNET_ENABLEAVX2=0 DOTNET_ENABLESSSE3=0 DOTNET_ENABLESSE2=0 and got these results:
This should help fix the regression for ARM64 referenced in #68667
# Conflicts:
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
Thanks, I didn't see there was already a PR open for this when I posted. Fortunately the merging was trivial.
Hi again @adamsitnik. I have updated the benchmark numbers in my initial post now that #70650 has been merged; sorry it took so long. I feel less confident that this change is necessary from a performance perspective, since that regression has already been resolved, but the code still reads quite differently, so please let me know which you prefer.
# Conflicts:
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
#	src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
What's the reason it's stalled? Is there anything you need from me?
@yesmey please excuse me for the delay. I am going to run benchmarks on arm64 and x64 machines today, and if the results look good, I am going to review the PR.
Overall, the changes look good, but we need to address one arm64 regression and it would be great to try my suggestions to see if we can improve perf any further.
@yesmey big thanks for your contribution!
arm64 results
BenchmarkDotNet=v0.13.2.1950-nightly, OS=ubuntu 20.04
Unknown processor
.NET SDK=8.0.100-alpha.1.22565.7
[Host] : .NET 8.0.0 (8.0.22.55902), Arm64 RyuJIT AdvSIMD
LaunchCount=6 MemoryRandomization=True
Type | Method | Job | Size | Mean | Ratio |
---|---|---|---|---|---|
Span<Byte> | Reverse | PR | 4 | 6.863 ns | 0.68 |
Span<Byte> | Reverse | main | 4 | 10.123 ns | 1.00 |
Span<Char> | Reverse | PR | 4 | 5.679 ns | 1.06 |
Span<Char> | Reverse | main | 4 | 5.303 ns | 1.00 |
Span<Int32> | Reverse | PR | 4 | 5.470 ns | 1.06 |
Span<Int32> | Reverse | main | 4 | 5.189 ns | 1.00 |
Span<Int64> | Reverse | PR | 4 | 5.897 ns | 0.89 |
Span<Int64> | Reverse | main | 4 | 6.694 ns | 1.00 |
Span<Byte> | Reverse | PR | 8 | 8.927 ns | 0.84 |
Span<Byte> | Reverse | main | 8 | 10.579 ns | 1.00 |
Span<Char> | Reverse | PR | 8 | 5.267 ns | 0.77 |
Span<Char> | Reverse | main | 8 | 6.824 ns | 1.00 |
Span<Int32> | Reverse | PR | 8 | 5.479 ns | 0.89 |
Span<Int32> | Reverse | main | 8 | 6.199 ns | 1.00 |
Span<Int64> | Reverse | PR | 8 | 7.017 ns | 0.82 |
Span<Int64> | Reverse | main | 8 | 8.530 ns | 1.00 |
Span<Byte> | Reverse | PR | 16 | 8.793 ns | 0.70 |
Span<Byte> | Reverse | main | 16 | 12.689 ns | 1.00 |
Span<Char> | Reverse | PR | 16 | 5.275 ns | 0.84 |
Span<Char> | Reverse | main | 16 | 6.331 ns | 1.00 |
Span<Int32> | Reverse | PR | 16 | 6.653 ns | 0.77 |
Span<Int32> | Reverse | main | 16 | 8.730 ns | 1.00 |
Span<Int64> | Reverse | PR | 16 | 9.895 ns | 0.75 |
Span<Int64> | Reverse | main | 16 | 13.121 ns | 1.00 |
Span<Byte> | Reverse | PR | 32 | 7.317 ns | 0.74 |
Span<Byte> | Reverse | main | 32 | 9.620 ns | 1.00 |
Span<Char> | Reverse | PR | 32 | 6.383 ns | 0.73 |
Span<Char> | Reverse | main | 32 | 8.818 ns | 1.00 |
Span<Int32> | Reverse | PR | 32 | 10.073 ns | 0.77 |
Span<Int32> | Reverse | main | 32 | 13.074 ns | 1.00 |
Span<Int64> | Reverse | PR | 32 | 16.117 ns | 0.72 |
Span<Int64> | Reverse | main | 32 | 22.447 ns | 1.00 |
Span<Byte> | Reverse | PR | 64 | 8.620 ns | 0.90 |
Span<Byte> | Reverse | main | 64 | 15.146 ns | 1.00 |
Span<Char> | Reverse | PR | 64 | 9.349 ns | 0.73 |
Span<Char> | Reverse | main | 64 | 12.725 ns | 1.00 |
Span<Int32> | Reverse | PR | 64 | 16.312 ns | 0.73 |
Span<Int32> | Reverse | main | 64 | 22.501 ns | 1.00 |
Span<Int64> | Reverse | PR | 64 | 28.723 ns | 0.70 |
Span<Int64> | Reverse | main | 64 | 40.926 ns | 1.00 |
Span<Byte> | Reverse | PR | 512 | 41.769 ns | 1.13 |
Span<Byte> | Reverse | main | 512 | 37.129 ns | 1.00 |
Span<Char> | Reverse | PR | 512 | 52.922 ns | 0.64 |
Span<Char> | Reverse | main | 512 | 85.611 ns | 1.00 |
Span<Int32> | Reverse | PR | 512 | 101.891 ns | 0.68 |
Span<Int32> | Reverse | main | 512 | 150.510 ns | 1.00 |
Span<Int64> | Reverse | PR | 512 | 204.654 ns | 0.67 |
Span<Int64> | Reverse | main | 512 | 305.020 ns | 1.00 |
x64 results (AMD)
BenchmarkDotNet=v0.13.2.1950-nightly, OS=Windows 11 (10.0.22621.819)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22565.7
[Host] : .NET 8.0.0 (8.0.22.55902), X64 RyuJIT AVX2
LaunchCount=9 MemoryRandomization=True
Type | Method | Job | Size | Mean | Ratio |
---|---|---|---|---|---|
Span<Byte> | Reverse | PR | 4 | 3.525 ns | 0.89 |
Span<Byte> | Reverse | main | 4 | 3.954 ns | 1.00 |
Span<Char> | Reverse | PR | 4 | 4.239 ns | 0.97 |
Span<Char> | Reverse | main | 4 | 4.377 ns | 1.00 |
Span<Int32> | Reverse | PR | 4 | 3.626 ns | 0.95 |
Span<Int32> | Reverse | main | 4 | 3.806 ns | 1.00 |
Span<Int64> | Reverse | PR | 4 | 3.774 ns | 0.94 |
Span<Int64> | Reverse | main | 4 | 4.024 ns | 1.00 |
Span<Byte> | Reverse | PR | 8 | 3.453 ns | 0.65 |
Span<Byte> | Reverse | main | 8 | 5.293 ns | 1.00 |
Span<Char> | Reverse | PR | 8 | 3.981 ns | 0.68 |
Span<Char> | Reverse | main | 8 | 5.907 ns | 1.00 |
Span<Int32> | Reverse | PR | 8 | 3.824 ns | 1.03 |
Span<Int32> | Reverse | main | 8 | 3.715 ns | 1.00 |
Span<Int64> | Reverse | PR | 8 | 3.792 ns | 0.89 |
Span<Int64> | Reverse | main | 8 | 4.254 ns | 1.00 |
Span<Byte> | Reverse | PR | 16 | 3.477 ns | 0.34 |
Span<Byte> | Reverse | main | 16 | 10.383 ns | 1.00 |
Span<Char> | Reverse | PR | 16 | 4.242 ns | 0.89 |
Span<Char> | Reverse | main | 16 | 4.771 ns | 1.00 |
Span<Int32> | Reverse | PR | 16 | 3.825 ns | 0.98 |
Span<Int32> | Reverse | main | 16 | 3.896 ns | 1.00 |
Span<Int64> | Reverse | PR | 16 | 4.733 ns | 0.91 |
Span<Int64> | Reverse | main | 16 | 5.216 ns | 1.00 |
Span<Byte> | Reverse | PR | 32 | 3.625 ns | 0.89 |
Span<Byte> | Reverse | main | 32 | 4.064 ns | 1.00 |
Span<Char> | Reverse | PR | 32 | 4.250 ns | 0.83 |
Span<Char> | Reverse | main | 32 | 5.137 ns | 1.00 |
Span<Int32> | Reverse | PR | 32 | 4.531 ns | 0.94 |
Span<Int32> | Reverse | main | 32 | 4.822 ns | 1.00 |
Span<Int64> | Reverse | PR | 32 | 5.958 ns | 0.87 |
Span<Int64> | Reverse | main | 32 | 6.849 ns | 1.00 |
Span<Byte> | Reverse | PR | 64 | 3.603 ns | 0.80 |
Span<Byte> | Reverse | main | 64 | 4.504 ns | 1.00 |
Span<Char> | Reverse | PR | 64 | 5.089 ns | 0.84 |
Span<Char> | Reverse | main | 64 | 6.062 ns | 1.00 |
Span<Int32> | Reverse | PR | 64 | 6.270 ns | 0.94 |
Span<Int32> | Reverse | main | 64 | 6.659 ns | 1.00 |
Span<Int64> | Reverse | PR | 64 | 8.075 ns | 0.82 |
Span<Int64> | Reverse | main | 64 | 9.849 ns | 1.00 |
Span<Byte> | Reverse | PR | 512 | 9.255 ns | 0.83 |
Span<Byte> | Reverse | main | 512 | 11.180 ns | 1.00 |
Span<Char> | Reverse | PR | 512 | 15.892 ns | 0.86 |
Span<Char> | Reverse | main | 512 | 18.502 ns | 1.00 |
Span<Int32> | Reverse | PR | 512 | 29.720 ns | 1.00 |
Span<Int32> | Reverse | main | 512 | 29.701 ns | 1.00 |
Span<Int64> | Reverse | PR | 512 | 62.906 ns | 1.00 |
Span<Int64> | Reverse | main | 512 | 63.263 ns | 1.00 |
tempFirst.StoreUnsafe(ref last);

first = ref Unsafe.Add(ref first, Vector256<byte>.Count);
last = ref Unsafe.Subtract(ref last, Vector256<byte>.Count);
Initially I thought this could lead to a GC hole similar to #75792 (comment), but since we modify both first and last in every loop iteration, it looks like last will never go before buf.
@tannergooding @EgorBo Is my understanding correct?
The overall handling of mutating byrefs is potentially buggy and error prone. It's really easy to accidentally do the wrong thing, especially over the course of several refactorings.
I think it's ok/better to leave some small perf on the table in order to keep the code more readable/maintainable and less error prone.
CC. @jkotas
In this particular case, it would be good to look at where the codegen deficiencies between the two approaches are.
We could likely declare firstOffset/lastOffset outside the loop and do simple increments to improve codegen a bit, for example.
We should also validate that numElements is being treated as a constant by the JIT. If not, we should inline the usages of Vector256<byte>.Count or use the existing internal constant Vector256.Size.
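For illustration, the offset-based shape could look roughly like this (a sketch under the assumptions above; buf/length, the helper name, and the mask construction are made up for the example, and the cross-platform Vector256.Shuffle stands in for the vpshufb/vperm2i128 pair):

using System.Runtime.Intrinsics;

static void ReverseVector256Sketch(ref byte buf, nuint length) // assumes length >= 2 * Vector256<byte>.Count
{
    // Indices 31..0 reverse all bytes within one Vector256.
    Vector256<byte> reverseMask = Vector256.Create(
        (byte)31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    nuint numElements = (nuint)Vector256<byte>.Count;
    nuint firstOffset = 0;
    nuint lastOffset = length - numElements;

    // Only the two offsets mutate; the byref itself never moves, so it can
    // never be advanced past either end of the buffer.
    while (firstOffset + numElements <= lastOffset)
    {
        Vector256<byte> tempFirst = Vector256.LoadUnsafe(ref buf, firstOffset);
        Vector256<byte> tempLast = Vector256.LoadUnsafe(ref buf, lastOffset);

        Vector256.Shuffle(tempLast, reverseMask).StoreUnsafe(ref buf, firstOffset);
        Vector256.Shuffle(tempFirst, reverseMask).StoreUnsafe(ref buf, lastOffset);

        firstOffset += numElements;
        lastOffset -= numElements;
    }
    // The remaining middle is handled by the narrower paths.
}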
I'll take a look at this on Friday. Should be a simple fix to use offsets instead
Changed to offsets, is it better now?
- }
- else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length)
+ else if (remainder >= sizeof(long))
In case there is a buffer containing 63 bytes, the vectorized AVX2 loop is going to perform one iteration, then the remainder will be equal to 31 and we are going to call ReverseInner below. How about calling the ReverseEndianness-based loop then? The following code would then be executed:
- One iteration of the AVX2 loop (remainder == 31)
- Three iterations of the ReverseEndianness(long) loop (remainder == 7)
- A call to ReverseInner(7), or perhaps one iteration of ReverseEndianness(int) and then a call to ReverseInner(3).

Could you please give it a try and see how it affects performance?
Suggested change:
- else if (remainder >= sizeof(long))
+ if (remainder >= sizeof(long))
Sounds good, I'll take a look again on Friday
In your example the 63-byte range is indeed covered by the AVX2 loop in one iteration, but since the two vectors overlapped, the remainder ends up being -1 :) (the first and last 32-byte blocks share one byte, so 63 - 2*32 = -1)
However, a range of, for example, 68-96 bytes would fall through, so it is still worthwhile. Thanks
  // Load in values from beginning and end of the array.
- Vector128<short> tempFirst = Vector128.LoadUnsafe(ref bufShort, firstOffset);
- Vector128<short> tempLast = Vector128.LoadUnsafe(ref bufShort, lastOffset);
+ Vector128<ushort> tempFirst = Vector128.LoadUnsafe(ref Unsafe.As<char, ushort>(ref first));
This method accepts a ref char argument, but it uses ushort in most places. Since it's being called from one place, we could modify it to accept a ref ushort, change the calling code, and remove the char -> ushort "casts":
runtime/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.cs
Lines 556 to 558 in 0f412b1
else if (Unsafe.SizeOf<T>() == sizeof(char))
{
    Reverse(ref Unsafe.As<T, char>(ref elements), length);
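For reference, the "cast" in question is just a byref reinterpretation, which is free at runtime; a minimal illustration (hypothetical helper name):

using System.Runtime.CompilerServices;

// char and ushort have identical size and representation, so reinterpreting
// the byref only changes the static type and emits no code.
static ref ushort AsUInt16(ref char c) => ref Unsafe.As<char, ushort>(ref c);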
As I'm sure you know, the casts themselves don't have any impact on the generated code, so this is purely a design choice. I think it makes more sense, and is more consistent, to keep the ref char input since we are in the SpanHelpers.Char context, and the caller would naturally want to pass a char.
- }
- else if (Vector128.IsHardwareAccelerated && (nuint)Vector128<byte>.Count * 2 <= length)
The removal of the Vector128 code path causes a 10% regression for larger inputs on arm64. This needs to be addressed. I can run the benchmarks for you on an arm64 machine, just ping me here.
I changed it back to the previous behavior, can you run the benchmarks again when you get time, please?
The changes look good to me, and I've also verified the performance on arm64, x64 AVX, and x64 AVX2. Big thanks for your contribution @yesmey!
In the details below you can find the results. I use the word "before" to refer to the state of your PR before addressing the most recent feedback; "after" is the current state.
arm64
BenchmarkDotNet=v0.13.2.1950-nightly, OS=ubuntu 20.04
Unknown processor
.NET SDK=8.0.100-alpha.1.22567.28
[Host] : .NET 8.0.0 (8.0.22.55902), Arm64 RyuJIT AdvSIMD
LaunchCount=3 MemoryRandomization=True
Type | Job | Size | Mean | Ratio |
---|---|---|---|---|
Span<Byte> | after | 4 | 6.698 ns | 0.68 |
Span<Byte> | before | 4 | 6.682 ns | 0.68 |
Span<Byte> | main | 4 | 9.825 ns | 1.00 |
Span<Char> | after | 4 | 5.529 ns | 1.03 |
Span<Char> | before | 4 | 6.127 ns | 1.11 |
Span<Char> | main | 4 | 5.344 ns | 1.00 |
Span<Int32> | after | 4 | 5.488 ns | 1.07 |
Span<Int32> | before | 4 | 5.899 ns | 1.16 |
Span<Int32> | main | 4 | 5.112 ns | 1.00 |
Span<Int64> | after | 4 | 5.589 ns | 0.78 |
Span<Int64> | before | 4 | 5.638 ns | 0.79 |
Span<Int64> | main | 4 | 7.176 ns | 1.00 |
Span<Byte> | after | 8 | 7.064 ns | 0.67 |
Span<Byte> | before | 8 | 8.591 ns | 0.82 |
Span<Byte> | main | 8 | 10.477 ns | 1.00 |
Span<Char> | after | 8 | 5.710 ns | 0.82 |
Span<Char> | before | 8 | 5.254 ns | 0.76 |
Span<Char> | main | 8 | 6.935 ns | 1.00 |
Span<Int32> | after | 8 | 5.433 ns | 0.90 |
Span<Int32> | before | 8 | 5.472 ns | 0.90 |
Span<Int32> | main | 8 | 6.054 ns | 1.00 |
Span<Int64> | after | 8 | 7.269 ns | 0.85 |
Span<Int64> | before | 8 | 7.194 ns | 0.84 |
Span<Int64> | main | 8 | 8.546 ns | 1.00 |
Span<Byte> | after | 16 | 7.062 ns | 0.54 |
Span<Byte> | before | 16 | 9.330 ns | 0.72 |
Span<Byte> | main | 16 | 13.085 ns | 1.00 |
Span<Char> | after | 16 | 5.631 ns | 0.92 |
Span<Char> | before | 16 | 5.345 ns | 0.88 |
Span<Char> | main | 16 | 6.088 ns | 1.00 |
Span<Int32> | after | 16 | 7.168 ns | 0.88 |
Span<Int32> | before | 16 | 6.648 ns | 0.81 |
Span<Int32> | main | 16 | 8.188 ns | 1.00 |
Span<Int64> | after | 16 | 11.445 ns | 0.87 |
Span<Int64> | before | 16 | 9.944 ns | 0.75 |
Span<Int64> | main | 16 | 13.173 ns | 1.00 |
Span<Byte> | after | 32 | 6.949 ns | 0.73 |
Span<Byte> | before | 32 | 7.262 ns | 0.77 |
Span<Byte> | main | 32 | 9.500 ns | 1.00 |
Span<Char> | after | 32 | 7.122 ns | 0.83 |
Span<Char> | before | 32 | 6.390 ns | 0.74 |
Span<Char> | main | 32 | 8.642 ns | 1.00 |
Span<Int32> | after | 32 | 11.416 ns | 0.84 |
Span<Int32> | before | 32 | 9.863 ns | 0.72 |
Span<Int32> | main | 32 | 13.628 ns | 1.00 |
Span<Int64> | after | 32 | 19.991 ns | 0.89 |
Span<Int64> | before | 32 | 16.149 ns | 0.72 |
Span<Int64> | main | 32 | 22.357 ns | 1.00 |
Span<Byte> | after | 64 | 9.200 ns | 1.02 |
Span<Byte> | before | 64 | 9.338 ns | 1.05 |
Span<Byte> | main | 64 | 9.036 ns | 1.00 |
Span<Char> | after | 64 | 11.689 ns | 0.92 |
Span<Char> | before | 64 | 10.170 ns | 0.80 |
Span<Char> | main | 64 | 12.748 ns | 1.00 |
Span<Int32> | after | 64 | 20.204 ns | 0.92 |
Span<Int32> | before | 64 | 16.103 ns | 0.73 |
Span<Int32> | main | 64 | 22.023 ns | 1.00 |
Span<Int64> | after | 64 | 36.913 ns | 0.90 |
Span<Int64> | before | 64 | 28.603 ns | 0.70 |
Span<Int64> | main | 64 | 40.904 ns | 1.00 |
Span<Byte> | after | 512 | 34.769 ns | 0.94 |
Span<Byte> | before | 512 | 41.475 ns | 1.12 |
Span<Byte> | main | 512 | 36.971 ns | 1.00 |
Span<Char> | after | 512 | 70.532 ns | 0.91 |
Span<Char> | before | 512 | 52.890 ns | 0.68 |
Span<Char> | main | 512 | 77.595 ns | 1.00 |
Span<Int32> | after | 512 | 138.098 ns | 0.93 |
Span<Int32> | before | 512 | 102.976 ns | 0.69 |
Span<Int32> | main | 512 | 149.035 ns | 1.00 |
Span<Int64> | after | 512 | 277.912 ns | 0.91 |
Span<Int64> | before | 512 | 205.370 ns | 0.67 |
Span<Int64> | main | 512 | 305.281 ns | 1.00 |
x64 AVX2
BenchmarkDotNet=v0.13.2.1950-nightly, OS=Windows 11 (10.0.22621.819)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22567.28
[Host] : .NET 8.0.0 (8.0.22.55902), X64 RyuJIT AVX2
all-jobs : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
LaunchCount=3 MemoryRandomization=True
Type | Job | Size | Ratio |
---|---|---|---|
Span<Byte> | after | 4 | 0.90 |
Span<Byte> | before | 4 | 0.89 |
Span<Byte> | main | 4 | 1.00 |
Span<Char> | after | 4 | 0.94 |
Span<Char> | before | 4 | 0.95 |
Span<Char> | main | 4 | 1.00 |
Span<Int32> | after | 4 | 0.91 |
Span<Int32> | before | 4 | 0.95 |
Span<Int32> | main | 4 | 1.00 |
Span<Int64> | after | 4 | 0.90 |
Span<Int64> | before | 4 | 0.94 |
Span<Int64> | main | 4 | 1.00 |
Span<Byte> | after | 8 | 0.67 |
Span<Byte> | before | 8 | 0.65 |
Span<Byte> | main | 8 | 1.00 |
Span<Char> | after | 8 | 0.68 |
Span<Char> | before | 8 | 0.67 |
Span<Char> | main | 8 | 1.00 |
Span<Int32> | after | 8 | 0.96 |
Span<Int32> | before | 8 | 1.03 |
Span<Int32> | main | 8 | 1.00 |
Span<Int64> | after | 8 | 0.83 |
Span<Int64> | before | 8 | 0.89 |
Span<Int64> | main | 8 | 1.00 |
Span<Byte> | after | 16 | 0.35 |
Span<Byte> | before | 16 | 0.34 |
Span<Byte> | main | 16 | 1.00 |
Span<Char> | after | 16 | 0.86 |
Span<Char> | before | 16 | 0.90 |
Span<Char> | main | 16 | 1.00 |
Span<Int32> | after | 16 | 0.91 |
Span<Int32> | before | 16 | 0.99 |
Span<Int32> | main | 16 | 1.00 |
Span<Int64> | after | 16 | 0.82 |
Span<Int64> | before | 16 | 0.90 |
Span<Int64> | main | 16 | 1.00 |
Span<Byte> | after | 32 | 1.01 |
Span<Byte> | before | 32 | 0.89 |
Span<Byte> | main | 32 | 1.00 |
Span<Char> | after | 32 | 0.82 |
Span<Char> | before | 32 | 0.82 |
Span<Char> | main | 32 | 1.00 |
Span<Int32> | after | 32 | 0.85 |
Span<Int32> | before | 32 | 0.95 |
Span<Int32> | main | 32 | 1.00 |
Span<Int64> | after | 32 | 0.83 |
Span<Int64> | before | 32 | 0.86 |
Span<Int64> | main | 32 | 1.00 |
Span<Byte> | after | 64 | 0.91 |
Span<Byte> | before | 64 | 0.82 |
Span<Byte> | main | 64 | 1.00 |
Span<Char> | after | 64 | 0.88 |
Span<Char> | before | 64 | 0.84 |
Span<Char> | main | 64 | 1.00 |
Span<Int32> | after | 64 | 0.88 |
Span<Int32> | before | 64 | 0.94 |
Span<Int32> | main | 64 | 1.00 |
Span<Int64> | after | 64 | 0.79 |
Span<Int64> | before | 64 | 0.82 |
Span<Int64> | main | 64 | 1.00 |
Span<Byte> | after | 512 | 0.89 |
Span<Byte> | before | 512 | 0.82 |
Span<Byte> | main | 512 | 1.00 |
Span<Char> | after | 512 | 0.90 |
Span<Char> | before | 512 | 0.84 |
Span<Char> | main | 512 | 1.00 |
Span<Int32> | after | 512 | 0.99 |
Span<Int32> | before | 512 | 0.98 |
Span<Int32> | main | 512 | 1.00 |
Span<Int64> | after | 512 | 1.00 |
Span<Int64> | before | 512 | 0.99 |
Span<Int64> | main | 512 | 1.00 |
x64 AVX
BenchmarkDotNet=v0.13.2.1950-nightly, OS=Windows 11 (10.0.22621.819)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=8.0.100-alpha.1.22567.28
[Host] : .NET 8.0.0 (8.0.22.55902), X64 RyuJIT AVX2
all jobs : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX
EnvironmentVariables=COMPlus_EnableAVX2=0 MemoryRandomization=True
Type | Job | Size | Mean | Ratio |
---|---|---|---|---|
Span<Byte> | after | 4 | 3.549 ns | 0.90 |
Span<Byte> | before | 4 | 3.504 ns | 0.89 |
Span<Byte> | main | 4 | 3.926 ns | 1.00 |
Span<Char> | after | 4 | 3.860 ns | 0.99 |
Span<Char> | before | 4 | 3.934 ns | 1.01 |
Span<Char> | main | 4 | 3.885 ns | 1.00 |
Span<Int32> | after | 4 | 3.531 ns | 0.94 |
Span<Int32> | before | 4 | 3.570 ns | 0.95 |
Span<Int32> | main | 4 | 3.754 ns | 1.00 |
Span<Int64> | after | 4 | 3.125 ns | 0.88 |
Span<Int64> | before | 4 | 3.556 ns | 1.00 |
Span<Int64> | main | 4 | 3.555 ns | 1.00 |
Span<Byte> | after | 8 | 3.507 ns | 0.69 |
Span<Byte> | before | 8 | 3.448 ns | 0.68 |
Span<Byte> | main | 8 | 5.072 ns | 1.00 |
Span<Char> | after | 8 | 3.519 ns | 0.65 |
Span<Char> | before | 8 | 3.574 ns | 0.66 |
Span<Char> | main | 8 | 5.500 ns | 1.00 |
Span<Int32> | after | 8 | 3.599 ns | 0.97 |
Span<Int32> | before | 8 | 3.555 ns | 0.96 |
Span<Int32> | main | 8 | 3.698 ns | 1.00 |
Span<Int64> | after | 8 | 4.038 ns | 0.95 |
Span<Int64> | before | 8 | 4.297 ns | 1.01 |
Span<Int64> | main | 8 | 4.265 ns | 1.00 |
Span<Byte> | after | 16 | 3.771 ns | 0.37 |
Span<Byte> | before | 16 | 3.435 ns | 0.33 |
Span<Byte> | main | 16 | 10.378 ns | 1.00 |
Span<Char> | after | 16 | 3.512 ns | 0.93 |
Span<Char> | before | 16 | 3.548 ns | 0.94 |
Span<Char> | main | 16 | 3.784 ns | 1.00 |
Span<Int32> | after | 16 | 4.015 ns | 0.95 |
Span<Int32> | before | 16 | 4.085 ns | 0.96 |
Span<Int32> | main | 16 | 4.236 ns | 1.00 |
Span<Int64> | after | 16 | 5.458 ns | 0.95 |
Span<Int64> | before | 16 | 5.468 ns | 0.95 |
Span<Int64> | main | 16 | 5.740 ns | 1.00 |
Span<Byte> | after | 32 | 3.775 ns | 0.93 |
Span<Byte> | before | 32 | 3.740 ns | 0.93 |
Span<Byte> | main | 32 | 4.040 ns | 1.00 |
Span<Char> | after | 32 | 4.326 ns | 0.91 |
Span<Char> | before | 32 | 4.269 ns | 0.90 |
Span<Char> | main | 32 | 4.760 ns | 1.00 |
Span<Int32> | after | 32 | 5.715 ns | 0.91 |
Span<Int32> | before | 32 | 5.853 ns | 0.93 |
Span<Int32> | main | 32 | 6.266 ns | 1.00 |
Span<Int64> | after | 32 | 8.385 ns | 1.04 |
Span<Int64> | before | 32 | 8.146 ns | 1.02 |
Span<Int64> | main | 32 | 8.065 ns | 1.00 |
Span<Byte> | after | 64 | 4.527 ns | 0.87 |
Span<Byte> | before | 64 | 5.103 ns | 0.98 |
Span<Byte> | main | 64 | 5.200 ns | 1.00 |
Span<Char> | after | 64 | 7.003 ns | 1.04 |
Span<Char> | before | 64 | 5.867 ns | 0.87 |
Span<Char> | main | 64 | 6.712 ns | 1.00 |
Span<Int32> | after | 64 | 8.624 ns | 0.94 |
Span<Int32> | before | 64 | 8.693 ns | 0.94 |
Span<Int32> | main | 64 | 9.222 ns | 1.00 |
Span<Int64> | after | 64 | 11.523 ns | 0.90 |
Span<Int64> | before | 64 | 11.552 ns | 0.90 |
Span<Int64> | main | 64 | 12.817 ns | 1.00 |
Span<Byte> | after | 512 | 14.307 ns | 0.93 |
Span<Byte> | before | 512 | 18.963 ns | 1.24 |
Span<Byte> | main | 512 | 15.323 ns | 1.00 |
Span<Char> | after | 512 | 26.676 ns | 1.06 |
Span<Char> | before | 512 | 21.792 ns | 0.87 |
Span<Char> | main | 512 | 25.158 ns | 1.00 |
Span<Int32> | after | 512 | 41.424 ns | 0.94 |
Span<Int32> | before | 512 | 41.353 ns | 0.94 |
Span<Int32> | main | 512 | 43.920 ns | 1.00 |
Span<Int64> | after | 512 | 81.225 ns | 0.91 |
Span<Int64> | before | 512 | 78.500 ns | 0.89 |
Span<Int64> | main | 512 | 89.035 ns | 1.00 |
  }

  // Store any remaining values one-by-one
  if (remainder > 1)
  {
-     ReverseInner(ref first, (nuint)remainder);
+     ReverseInner(ref Unsafe.Add(ref buf, offset), (nuint)remainder);
The refactor improves the codegen:
Main, X64 RyuJIT AVX2
; System.SpanHelpers.Reverse(Int32 ByRef, UIntPtr)
vzeroupper
cmp rdx,10
jb short M00_L02
nop dword ptr [rax]
mov rax,rdx
shr rax,3
shr rax,1
vmovupd ymm0,[7FFEC5FDFFE0]
xor r8d,r8d
test rax,rax
jbe short M00_L01
nop dword ptr [rax+rax]
M00_L00:
lea r9,[r8*8]
inc r8
lea r10,[r8*8]
mov r11,rdx
sub r11,r10
vmovdqu ymm1,ymmword ptr [rcx+r9*4]
vmovdqu ymm2,ymmword ptr [rcx+r11*4]
vpermd ymm1,ymm0,ymm1
vpermd ymm2,ymm0,ymm2
vmovdqu ymmword ptr [rcx+r9*4],ymm2
vmovdqu ymmword ptr [rcx+r11*4],ymm1
cmp r8,rax
jb short M00_L00
M00_L01:
shl rax,3
lea rcx,[rcx+rax*4]
add rax,rax
sub rdx,rax
jmp short M00_L05
M00_L02:
cmp rdx,8
jb short M00_L05
mov rax,rdx
shr rax,2
shr rax,1
xor r8d,r8d
test rax,rax
jbe short M00_L04
M00_L03:
lea r9,[r8*4]
inc r8
lea r10,[r8*4]
mov r11,rdx
sub r11,r10
vmovdqu xmm0,xmmword ptr [rcx+r9*4]
vmovdqu xmm1,xmmword ptr [rcx+r11*4]
vpshufd xmm0,xmm0,1B
vpshufd xmm1,xmm1,1B
vmovdqu xmmword ptr [rcx+r9*4],xmm1
vmovdqu xmmword ptr [rcx+r11*4],xmm0
cmp r8,rax
jb short M00_L03
M00_L04:
shl rax,2
lea rcx,[rcx+rax*4]
add rax,rax
sub rdx,rax
M00_L05:
cmp rdx,1
jbe short M00_L07
movsxd rax,edx
lea rax,[rcx+rax*4-4]
nop word ptr [rax+rax]
M00_L06:
mov edx,[rcx]
mov r8d,[rax]
mov [rcx],r8d
mov [rax],edx
add rcx,4
add rax,0FFFFFFFFFFFFFFFC
cmp rcx,rax
jb short M00_L06
M00_L07:
vzeroupper
ret
; Total bytes of code 283
Before, X64 RyuJIT AVX2
; System.SpanHelpers.Reverse(Int32 ByRef, UIntPtr)
vzeroupper
lea rax,[rcx+rdx*4]
cmp rdx,8
jb short M00_L01
add rax,0FFFFFFFFFFFFFFE0
vmovupd ymm0,[7FFEC68C0500]
nop word ptr [rax+rax]
M00_L00:
vmovdqu ymm1,ymmword ptr [rcx]
vmovdqu ymm2,ymmword ptr [rax]
vpermd ymm1,ymm0,ymm1
vpermd ymm2,ymm0,ymm2
vmovdqu ymmword ptr [rcx],ymm2
vmovdqu ymmword ptr [rax],ymm1
add rcx,20
add rax,0FFFFFFFFFFFFFFE0
cmp rcx,rax
jb short M00_L00
lea rdx,[rax+20]
sub rdx,rcx
mov rax,rdx
sar rax,3F
and rax,3
add rdx,rax
sar rdx,2
jmp short M00_L03
M00_L01:
cmp rdx,4
jb short M00_L03
add rax,0FFFFFFFFFFFFFFF0
M00_L02:
vmovdqu xmm0,xmmword ptr [rcx]
vmovdqu xmm1,xmmword ptr [rax]
vpshufd xmm0,xmm0,1B
vpshufd xmm1,xmm1,1B
vmovdqu xmmword ptr [rcx],xmm1
vmovdqu xmmword ptr [rax],xmm0
add rcx,10
add rax,0FFFFFFFFFFFFFFF0
cmp rcx,rax
jb short M00_L02
lea rdx,[rax+10]
sub rdx,rcx
mov rax,rdx
sar rax,3F
and rax,3
add rdx,rax
sar rdx,2
M00_L03:
cmp rdx,1
jle short M00_L05
mov rax,rcx
lea rdx,[rax+rdx*4-4]
M00_L04:
mov ecx,[rax]
mov r8d,[rdx]
mov [rax],r8d
mov [rdx],ecx
add rax,4
add rdx,0FFFFFFFFFFFFFFFC
cmp rax,rdx
jb short M00_L04
M00_L05:
vzeroupper
ret
; Total bytes of code 213
After, X64 RyuJIT AVX2
; System.SpanHelpers.Reverse(Int32 ByRef, UIntPtr)
vzeroupper
xor eax,eax
cmp rdx,8
jl short M00_L01
add rdx,0FFFFFFFFFFFFFFF8
vmovupd ymm0,[7FFEC6512680]
M00_L00:
vmovdqu ymm1,ymmword ptr [rcx+rax*4]
vmovdqu ymm2,ymmword ptr [rcx+rdx*4]
vpermd ymm1,ymm0,ymm1
vpermd ymm2,ymm0,ymm2
vmovdqu ymmword ptr [rcx+rax*4],ymm2
vmovdqu ymmword ptr [rcx+rdx*4],ymm1
add rax,8
add rdx,0FFFFFFFFFFFFFFF8
cmp rdx,rax
jge short M00_L00
add rdx,8
sub rdx,rax
jmp short M00_L03
nop word ptr [rax+rax]
M00_L01:
cmp rdx,4
jl short M00_L03
add rdx,0FFFFFFFFFFFFFFFC
M00_L02:
vmovdqu xmm0,xmmword ptr [rcx+rax*4]
vmovdqu xmm1,xmmword ptr [rcx+rdx*4]
vpshufd xmm0,xmm0,1B
vpshufd xmm1,xmm1,1B
vmovdqu xmmword ptr [rcx+rax*4],xmm1
vmovdqu xmmword ptr [rcx+rdx*4],xmm0
add rax,4
add rdx,0FFFFFFFFFFFFFFFC
cmp rdx,rax
jge short M00_L02
add rdx,4
sub rdx,rax
M00_L03:
cmp rdx,1
jle short M00_L05
lea rax,[rcx+rax*4]
lea rdx,[rax+rdx*4-4]
M00_L04:
mov ecx,[rax]
mov r8d,[rdx]
mov [rax],r8d
mov [rdx],ecx
add rax,4
add rdx,0FFFFFFFFFFFFFFFC
cmp rax,rdx
jb short M00_L04
M00_L05:
vzeroupper
ret
; Total bytes of code 188
And it slightly improves the perf 👍
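For context, the scalar tail under discussion boils down to two refs swapping elements while walking toward each other; a minimal sketch (illustrative, not necessarily the exact ReverseInner source) of a shape that yields pointer-bumping codegen like the M00_L04 loop above:

using System.Runtime.CompilerServices;

static void ReverseInnerSketch<T>(ref T elements, nuint length) // assumes length > 1
{
    ref T first = ref elements;
    ref T last = ref Unsafe.Add(ref first, (nint)length - 1);
    do
    {
        // Swap the two elements, then step both refs inward.
        T temp = first;
        first = last;
        last = temp;
        first = ref Unsafe.Add(ref first, 1);
        last = ref Unsafe.Subtract(ref last, 1);
    } while (Unsafe.IsAddressLessThan(ref first, ref last));
}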
This reverts commit 6ddd06c.
nint remainder = (nint)length;
nint offset = 0;

if (Avx2.IsSupported && remainder >= Vector256<byte>.Count)
The trick of taking the vectorized path for sizes smaller than 2*Vector256.Count is neat, but it hurts performance in a number of cases: for lengths just above one vector the two overlapping vectors cover mostly the same bytes (for a 33-byte buffer they overlap on 31 bytes), so a lot of redundant work can be done for certain sizes with this PR.
I have opened #78604 on this.
@yesmey @adamsitnik This change significantly regressed performance for certain sizes that were not accounted for (see #70944 and #78605 for details). Stephen and I reverted it so that we do not have to deal with the noise from it. I think the core of the change is still good. Could you please revert the revert and augment it to address the regressions? Also, we may want to improve the coverage in the dotnet/performance repo.
@jkotas Thank you. I will try to find a better middle ground for when the AVX path is more profitable. I will also make sure to check the other types: char/int/long.
)" (dotnet#78605)" This reverts commit de005e5.
Span<T>.Reverse was vectorized earlier this year.
This pull request improves the performance for smaller inputs and adds a bswap variant for byte spans.
Notable changes:
Benchmarks:
Span.Reverse<long>
Benchmark result long
Span.Reverse<int>
Benchmark result int
Span.Reverse<char>
Benchmark result char
Span.Reverse<byte>
Benchmark result byte
Benchmark Code