JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

MihaZupan · 2023-05-27T23:48:52Z

Contributes to #82564

(First time touching the JIT so I don't really know what I'm doing)

Vector128<byte> v0;

// Expands patterns like
v0 >>> 4;
// to
(v0.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)15);

// and
v0 >>> nonConstAmount;
// to
int maskedShiftAmount = nonConstAmount & 7;
Vector128<int> shiftVector = Sse2.ConvertScalarToVector128Int32(maskedShiftAmount);
Vector128<byte> shiftedInput = Sse2.ShiftRightLogical(v0.AsInt32(), shiftVector).AsByte();
return shiftedInput & Vector128.Create((byte)(255 >> maskedShiftAmount));

Codegen for

[Benchmark]
[Arguments(42)]
public Vector128<byte> SimpleShift(byte input) =>
    Vector128.Create(input) >>> 4;

Before

; VectorTest.SimpleShift(Byte)
       sub       rsp,48
       vzeroupper
       movzx     eax,r8b
       vpbroadcastb xmm0,eax
       vmovaps   [rsp+30],xmm0
       mov       rax,[rsp+30]
       mov       [rsp+20],rax
       movzx     eax,byte ptr [rsp+20]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+28],al
       movzx     eax,byte ptr [rsp+21]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+29],al
       movzx     eax,byte ptr [rsp+22]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2A],al
       movzx     eax,byte ptr [rsp+23]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2B],al
       movzx     eax,byte ptr [rsp+24]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2C],al
       movzx     eax,byte ptr [rsp+25]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2D],al
       movzx     eax,byte ptr [rsp+26]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2E],al
       movzx     eax,byte ptr [rsp+27]
       shr       eax,4
       movzx     eax,al
       mov       [rsp+2F],al
       mov       rax,[rsp+28]
       mov       rcx,[rsp+38]
       mov       [rsp+10],rcx
       movzx     ecx,byte ptr [rsp+10]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+18],cl
       movzx     ecx,byte ptr [rsp+11]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+19],cl
       movzx     ecx,byte ptr [rsp+12]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1A],cl
       movzx     ecx,byte ptr [rsp+13]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1B],cl
       movzx     ecx,byte ptr [rsp+14]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1C],cl
       movzx     ecx,byte ptr [rsp+15]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1D],cl
       movzx     ecx,byte ptr [rsp+16]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1E],cl
       movzx     ecx,byte ptr [rsp+17]
       shr       ecx,4
       movzx     ecx,cl
       mov       [rsp+1F],cl
       mov       rcx,[rsp+18]
       mov       [rsp],rax
       mov       [rsp+8],rcx
       vmovaps   xmm0,[rsp]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       add       rsp,48
       ret
; Total bytes of code 319

After

; VectorTest.SimpleShift(Byte)
       vzeroupper
       movzx     eax,r8b
       vpbroadcastb xmm0,eax
       vpsrld    xmm0,xmm0,4
       vpand     xmm0,xmm0,[7FFC6F58A2E0]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       ret
; Total bytes of code 34

Codegen for

[Benchmark]
[Arguments(3)]
public Vector128<byte> SimpleShift(int shiftCount) =>
    Vector128.Create((byte)42) >>> shiftCount;

Before

; VectorTest.SimpleShift(Int32)
       sub       rsp,58
       vzeroupper
       vmovups   xmm0,[7FFC74E11BB0]
       vmovaps   [rsp+40],xmm0
       mov       rax,[rsp+40]
       mov       [rsp+20],rax
       xor       eax,eax
       mov       ecx,r8d
       and       ecx,7
M00_L00:
       lea       r9,[rsp+20]
       movsxd    r10,eax
       movzx     r9d,byte ptr [r9+r10]
       shrx      r9d,r9d,ecx
       movzx     r9d,r9b
       lea       r11,[rsp+28]
       mov       [r11+r10],r9b
       inc       eax
       cmp       eax,8
       jl        short M00_L00
       vmovaps   [rsp+30],xmm0
       mov       rax,[rsp+28]
       mov       rcx,[rsp+38]
       mov       [rsp+10],rcx
       xor       ecx,ecx
       and       r8d,7
M00_L01:
       lea       r9,[rsp+10]
       movsxd    r10,ecx
       movzx     r9d,byte ptr [r9+r10]
       shrx      r9d,r9d,r8d
       movzx     r9d,r9b
       lea       r11,[rsp+18]
       mov       [r11+r10],r9b
       inc       ecx
       cmp       ecx,8
       jl        short M00_L01
       mov       rcx,[rsp+18]
       mov       [rsp],rax
       mov       [rsp+8],rcx
       vmovaps   xmm0,[rsp]
       vmovups   [rdx],xmm0
       mov       rax,rdx
       add       rsp,58
       ret
; Total bytes of code 173

After

; VectorTest.SimpleShift(Int32)
       vzeroupper
       and       r8d,7
       vmovd     xmm0,r8d
       vmovups   xmm1,[7FFCEB69A330]
       vpsrld    xmm0,xmm1,xmm0
       mov       eax,0FF
       shrx      eax,eax,r8d
       vpbroadcastb xmm1,eax
       vpand     xmm0,xmm0,xmm1
       vmovups   [rdx],xmm0
       mov       rax,rdx
       ret
; Total bytes of code 52

ghost · 2023-05-27T23:49:07Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Contributes to #82564

(First time touching the JIT so I don't really know what I'm doing)

Vector128<byte> v0;

// Expands patterns like
v0 >>> 4;
// to
(v0.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)15);

Author:	MihaZupan
Assignees:	MihaZupan
Labels:	`area-CodeGen-coreclr`
Milestone:	8.0.0

tannergooding · 2023-05-30T13:38:50Z

src/coreclr/jit/hwintrinsicxarch.cpp

+            if (varTypeIsByte(simdBaseType) && !impStackTop(0).val->IsCnsIntOrI())
            {
-                // byte and sbyte would require more work to support
+                // non-constant byte and sbyte would require more work to support
                break;
            }


We shouldn't need this limitation. For shifts in particular, constant and non-constant shift amounts are handled the same way: we have overloads that take a byte shiftAmount and overloads that take a Vector128<T> shiftAmount, with the latter simply being Vector128.CreateScalarUnsafe(shiftAmount) of the non-constant byte.

That handling of constant vs non-constant is general and simply part of the existing gtNewSimdBinOp handling, so you shouldn't need to do anything additional really.

We'll already have masked the shift amount to the range 0-7 and converted it to a Vector128 if necessary, so you should only need to generate the shift followed by a bitwise AND that masks off the upper n bits of each element.

It basically comes down to this: if it's a constant, you can generate a new icon directly; otherwise the mask is a gtNewSimdBroadcastNode of 0xFF >> shiftAmount.

I've now updated it to handle non-const shift amounts as well, but the temporary nonConstantByteShiftCountOp feels a bit hacky - please let me know if I'm on the right track.

Right now the non-const path is something like:

static Vector128<byte> Shift(Vector128<byte> input, int shiftAmount)
{
    int maskedShiftAmount = shiftAmount & 7;
    Vector128<int> shiftVector = Sse2.ConvertScalarToVector128Int32(maskedShiftAmount);
    Vector128<byte> shiftedInput = Sse2.ShiftRightLogical(input.AsInt32(), shiftVector).AsByte();
    return shiftedInput & Vector128.Create((byte)(255 >> maskedShiftAmount));
}

MihaZupan · 2023-07-03T17:16:03Z

(don't mind the referenced issue spam - I'm just testing stuff and this PR is convenient)

tannergooding · 2023-07-13T16:51:34Z

CC. @dotnet/jit-contrib, this needs a secondary review before merging.

MihaZupan · 2023-07-16T08:01:58Z

The test failures were the ones tracked in #88582.

src/coreclr/jit/gentree.cpp

MihaZupan added the area-CodeGen-coreclr label (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) May 27, 2023

MihaZupan added this to the 8.0.0 milestone May 27, 2023

MihaZupan self-assigned this May 27, 2023

This was referenced May 28, 2023

Failed USB connection via port 54050, error 61, in tvOS arm64 Release AllSubsets_Mono #82637

Open

Timeout in System.Net.Quic.Functional.Tests #86019

Closed

Assert failure in GC/API/NoGCRegion/Callback_Svr test #86612

Closed

runfoapp bot mentioned this pull request May 29, 2023

Infra improvements for Helix #68176

Closed

tannergooding reviewed May 30, 2023

View reviewed changes

MihaZupan force-pushed the jit-byte-rsz branch from 366f44a to 9ce6b22 Compare June 22, 2023 17:25

build-analysis bot mentioned this pull request Jun 22, 2023

400 Bad Request when trying to fetch test run results dotnet/arcade#13848

Closed

3 tasks

Optimize logical right shifts for byte values on XArch

a28e981

MihaZupan force-pushed the jit-byte-rsz branch from 9ce6b22 to a28e981 Compare July 8, 2023 18:17

build-analysis bot mentioned this pull request Jul 8, 2023

simpleruntimeeventvalidation test failing in CI #88499

Closed

MihaZupan requested a review from tannergooding July 12, 2023 23:00

tannergooding approved these changes Jul 13, 2023

View reviewed changes

EgorBo reviewed Jul 17, 2023

View reviewed changes

src/coreclr/jit/gentree.cpp Show resolved Hide resolved

EgorBo approved these changes Jul 17, 2023

View reviewed changes

MihaZupan merged commit 89d435c into dotnet:main Jul 17, 2023
192 of 195 checks passed

MihaZupan added a commit to MihaZupan/runtime that referenced this pull request Jul 23, 2023

Remove >>> workaround now that dotnet#86841 is merged

086512f

MihaZupan added a commit to MihaZupan/runtime that referenced this pull request Aug 7, 2023

Remove >>> workaround now that dotnet#86841 is merged

5f46a3a

tannergooding mentioned this pull request Aug 8, 2023

Vector128.ShiftRightLogical doesn't use machine-instructions for some types (byte / sbyte) #75770

Closed

ghost locked as resolved and limited conversation to collaborators Aug 16, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

MihaZupan commented May 27, 2023 •

edited

Loading

ghost commented May 27, 2023

tannergooding May 30, 2023

tannergooding May 30, 2023

MihaZupan Jun 22, 2023

MihaZupan commented Jul 3, 2023 •

edited

Loading

tannergooding commented Jul 13, 2023

MihaZupan commented Jul 16, 2023

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

Conversation

MihaZupan commented May 27, 2023 • edited Loading

ghost commented May 27, 2023

tannergooding May 30, 2023

Choose a reason for hiding this comment

tannergooding May 30, 2023

Choose a reason for hiding this comment

MihaZupan Jun 22, 2023

Choose a reason for hiding this comment

MihaZupan commented Jul 3, 2023 • edited Loading

tannergooding commented Jul 13, 2023

MihaZupan commented Jul 16, 2023

MihaZupan commented May 27, 2023 •

edited

Loading

MihaZupan commented Jul 3, 2023 •

edited

Loading