Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector64: Avoid GSCookies in Vector64 software fallback #98293

Merged
merged 3 commits into from
Mar 19, 2024

Conversation

hez2010
Copy link
Contributor

@hez2010 hez2010 commented Feb 12, 2024

We are using stackalloc for the Vector64 software fallbacks, which results in a GSCookie check in the codegen.
The fallback is slow anyway because the code is not hardware accelerated, but let's at least initialize Vector64 directly to avoid the unnecessary check and skip the unnecessary zeroing.

As a consequence, it also makes the software fallbacks of Vector128, Vector256 and Vector512 faster, as they all rely on Vector64 when creating the vector.

On x86_64:

var x = Vector64.Create(1, 2);
var y = Vector64.Create(2, 3);
Console.WriteLine(Vector64.Sum(x + y));

Before:

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 104
       xor      eax, eax
       mov      qword ptr [rsp+0x28], rax
       vxorps   xmm4, xmm4, xmm4
       vmovdqu  ymmword ptr [rsp+0x30], ymm4
       mov      qword ptr [rsp+0x50], rax
       mov      rax, 0xF2C39AD32FFF
       mov      qword ptr [rsp+0x60], rax

G_M000_IG02:                ;; offset=0x0029
       mov      dword ptr [rsp+0x30], 1
       mov      dword ptr [rsp+0x34], 2
       mov      rcx, qword ptr [rsp+0x30]
       mov      dword ptr [rsp+0x28], 2
       mov      dword ptr [rsp+0x2C], 3
       mov      rax, qword ptr [rsp+0x28]
       mov      qword ptr [rsp+0x48], rcx
       mov      qword ptr [rsp+0x40], rax
       mov      ecx, dword ptr [rsp+0x48]
       add      ecx, dword ptr [rsp+0x40]
       mov      dword ptr [rsp+0x50], ecx
       mov      ecx, dword ptr [rsp+0x4C]
       add      ecx, dword ptr [rsp+0x44]
       mov      dword ptr [rsp+0x54], ecx
       mov      rcx, qword ptr [rsp+0x50]
       mov      qword ptr [rsp+0x38], rcx
       mov      ecx, dword ptr [rsp+0x38]
       add      ecx, dword ptr [rsp+0x3C]
       call     [System.Console:WriteLine(int)]
       mov      rcx, 0xF2C39AD32FFF
       cmp      qword ptr [rsp+0x60], rcx
       je       SHORT G_M000_IG03
       call     CORINFO_HELP_FAIL_FAST

G_M000_IG03:                ;; offset=0x00A3
       nop

G_M000_IG04:                ;; offset=0x00A4
       add      rsp, 104
       ret

; Total bytes of code 169

After:

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 88
       vxorps   xmm4, xmm4, xmm4
       vmovdqa  xmmword ptr [rsp+0x40], xmm4
       xor      eax, eax
       mov      qword ptr [rsp+0x50], rax

G_M000_IG02:                ;; offset=0x0015
       mov      dword ptr [rsp+0x50], 1
       mov      dword ptr [rsp+0x54], 2
       mov      rcx, qword ptr [rsp+0x50]
       mov      dword ptr [rsp+0x48], 2
       mov      dword ptr [rsp+0x4C], 3
       mov      rax, qword ptr [rsp+0x48]
       mov      qword ptr [rsp+0x38], rcx
       mov      qword ptr [rsp+0x30], rax
       mov      ecx, dword ptr [rsp+0x38]
       add      ecx, dword ptr [rsp+0x30]
       mov      dword ptr [rsp+0x40], ecx
       mov      ecx, dword ptr [rsp+0x3C]
       add      ecx, dword ptr [rsp+0x34]
       mov      dword ptr [rsp+0x44], ecx
       mov      rcx, qword ptr [rsp+0x40]
       mov      qword ptr [rsp+0x28], rcx
       mov      ecx, dword ptr [rsp+0x28]
       add      ecx, dword ptr [rsp+0x2C]
       call     [System.Console:WriteLine(int)]
       nop

G_M000_IG03:                ;; offset=0x007A
       add      rsp, 88
       ret

; Total bytes of code 127

@ghost ghost added the community-contribution Indicates that the PR has been added by a community member label Feb 12, 2024
@ghost
Copy link

ghost commented Feb 12, 2024

Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics
See info in area-owners.md if you want to be subscribed.

Issue Details

We are using stackalloc for the Vector64 software fallbacks, which results in a GSCookie check in the codegen. The fallback is slow anyway because the code is not hardware accelerated, but let's at least initialize Vector64 directly to avoid the unnecessary check.

On x86_64:

var x = Vector64.Create(1, 2);
var y = Vector64.Create(2, 3);
Console.WriteLine(Vector64.Sum(x + y));

Before:

G_M27646_IG01:  ;; offset=0x0000
       push     rbp
       sub      rsp, 64
       lea      rbp, [rsp+0x40]
       xor      eax, eax
       mov      qword ptr [rbp-0x28], rax
       vxorps   xmm8, xmm8, xmm8
       vmovdqa  xmmword ptr [rbp-0x20], xmm8
       mov      qword ptr [rbp-0x10], rax
       mov      qword ptr [rbp-0x08], 0x430657
G_M27646_IG02:  ;; offset=0x0026
       mov      dword ptr [rbp-0x10], 1
       mov      dword ptr [rbp-0x0C], 2
       mov      rdi, qword ptr [rbp-0x10]
       mov      dword ptr [rbp-0x18], 2
       mov      dword ptr [rbp-0x14], 3
       mov      rax, qword ptr [rbp-0x18]
       mov      qword ptr [rbp-0x30], rdi
       mov      qword ptr [rbp-0x38], rax
       mov      edi, dword ptr [rbp-0x30]
       add      edi, dword ptr [rbp-0x38]
       mov      dword ptr [rbp-0x28], edi
       mov      edi, dword ptr [rbp-0x2C]
       add      edi, dword ptr [rbp-0x34]
       mov      dword ptr [rbp-0x24], edi
       mov      rdi, qword ptr [rbp-0x28]
       mov      qword ptr [rbp-0x40], rdi
       mov      edi, dword ptr [rbp-0x40]
       add      edi, dword ptr [rbp-0x3C]
       call     [System.Console:WriteLine(int)]
       cmp      qword ptr [rbp-0x08], 0x430657
       je       SHORT G_M27646_IG03
       call     CORINFO_HELP_FAIL_FAST
G_M27646_IG03:  ;; offset=0x0087
       nop      
G_M27646_IG04:  ;; offset=0x0088
       add      rsp, 64
       pop      rbp
       ret      
; Total bytes of code 142

After:

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 88
       vxorps   xmm4, xmm4, xmm4
       vmovdqa  xmmword ptr [rsp+0x40], xmm4
       xor      eax, eax
       mov      qword ptr [rsp+0x50], rax

G_M000_IG02:                ;; offset=0x0015
       mov      dword ptr [rsp+0x50], 1
       mov      dword ptr [rsp+0x54], 2
       mov      rcx, qword ptr [rsp+0x50]
       mov      dword ptr [rsp+0x48], 2
       mov      dword ptr [rsp+0x4C], 3
       mov      rax, qword ptr [rsp+0x48]
       mov      qword ptr [rsp+0x38], rcx
       mov      qword ptr [rsp+0x30], rax
       mov      ecx, dword ptr [rsp+0x38]
       add      ecx, dword ptr [rsp+0x30]
       mov      dword ptr [rsp+0x40], ecx
       mov      ecx, dword ptr [rsp+0x3C]
       add      ecx, dword ptr [rsp+0x34]
       mov      dword ptr [rsp+0x44], ecx
       mov      rcx, qword ptr [rsp+0x40]
       mov      qword ptr [rsp+0x28], rcx
       mov      ecx, dword ptr [rsp+0x28]
       add      ecx, dword ptr [rsp+0x2C]
       call     [System.Console:WriteLine(int)]
       nop

G_M000_IG03:                ;; offset=0x007A
       add      rsp, 88
       ret

; Total bytes of code 127
Author: hez2010
Assignees: -
Labels:

area-System.Runtime.Intrinsics, community-contribution

Milestone: -

Copy link
Member

@tannergooding tannergooding left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Just noting that this doesn't really benefit anything as the fallbacks aren't used for most platforms nor should the types be used if IsHardwareAccelerated is false.

@tannergooding tannergooding merged commit 07c99ab into dotnet:main Mar 19, 2024
146 of 150 checks passed
@github-actions github-actions bot locked and limited conversation to collaborators Apr 18, 2024
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-System.Runtime.Intrinsics community-contribution Indicates that the PR has been added by a community member
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants