Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Codegen: Bound check not elided in ref struct Span field index access #72004

Open
Tracked by #109677
gfoidl opened this issue Jul 12, 2022 · 8 comments
Open
Tracked by #109677

Codegen: Bound check not elided in ref struct Span field index access #72004

gfoidl opened this issue Jul 12, 2022 · 8 comments
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone

Comments

@gfoidl
Copy link
Member

gfoidl commented Jul 12, 2022

Repro:

internal ref struct Foo
{
    private readonly Span<int> _span;

    internal Foo(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        if ((uint)index < (uint)_span.Length)
        {
            _span[index] = 42;
        }
    }
}

Codegen (x64):

       sub       rsp,38
       xor       eax,eax
       mov       [rsp+28],rax
       mov       [rsp+30],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+28]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       cmp       dword ptr [rsp+30],3
       jbe       short M00_L01
       lea       rdx,[rsp+28]
       cmp       dword ptr [rdx+8],3        ; redundant check for bound-check -- should be elided
       jbe       short M00_L03
       mov       rax,[rdx]
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,38
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
M00_L03:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 88

Workaround: fetch the span to a local.

+Span<int> span = _span;
-if ((uint)index < (uint)_span.Length)
+if ((uint)index < (uint)span.Length)
{
-   _span[index] = 42;
+   span[index] = 42;
}

When the index is not constant (here 3), the same issue occurs.

Complete listing of code (C#)
using BenchmarkDotNet.Attributes;

BenchmarkDotNet.Running.BenchmarkRunner.Run<Bench>();

[DisassemblyDiagnoser]
[ShortRunJob]
public class Bench
{
    private readonly int[] _array = new int[100];

    [Benchmark]
    public void NoLocalSpan()
    {
        Foo foo = new(_array);
        foo.Set(3);
    }

    [Benchmark]
    public void LocalSpan()
    {
        Foo1 foo = new(_array);
        foo.Set(3);
    }
}

internal ref struct Foo
{
    private readonly Span<int> _span;

    internal Foo(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        if ((uint)index < (uint)_span.Length)
        {
            _span[index] = 42;
        }
    }
}

internal ref struct Foo1
{
    private readonly Span<int> _span;

    internal Foo1(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        Span<int> span = _span;
        if ((uint)index < (uint)span.Length)
        {
            span[index] = 42;
        }
    }
}
Complete listing of generated code
; Bench.NoLocalSpan()
       sub       rsp,38
       xor       eax,eax
       mov       [rsp+28],rax
       mov       [rsp+30],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+28]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       cmp       dword ptr [rsp+30],3
       jbe       short M00_L01
       lea       rdx,[rsp+28]
       cmp       dword ptr [rdx+8],3
       jbe       short M00_L03
       mov       rax,[rdx]
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,38
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
M00_L03:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 88

; Bench.LocalSpan()
       sub       rsp,18
       xor       eax,eax
       mov       [rsp+8],rax
       mov       [rsp+10],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+8]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       lea       rdx,[rsp+8]
       mov       rax,[rdx]
       mov       edx,[rdx+8]
       cmp       edx,3
       jbe       short M00_L01
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,18
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
; Total bytes of code 77

Cf. #67448 (comment)

category:cq
theme:bounds-checks
skill-level:intermediate
cost:small
impact:small

@ghost ghost added the untriaged New issue has not been triaged by the area owner label Jul 12, 2022
@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jul 12, 2022
@ghost
Copy link

ghost commented Jul 12, 2022

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Repro:

internal ref struct Foo
{
    private readonly Span<int> _span;

    internal Foo(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        if ((uint)index < (uint)_span.Length)
        {
            _span[index] = 42;
        }
    }
}

Codegen (x64):

       sub       rsp,38
       xor       eax,eax
       mov       [rsp+28],rax
       mov       [rsp+30],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+28]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       cmp       dword ptr [rsp+30],3
       jbe       short M00_L01
       lea       rdx,[rsp+28]
       cmp       dword ptr [rdx+8],3        ; redundant check for bound-check -- should be elided
       jbe       short M00_L03
       mov       rax,[rdx]
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,38
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
M00_L03:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 88

Workaround: fetch the span to a local.

+Span<int> span = _span;
-if ((uint)index < (uint)_span.Length)
+if ((uint)index < (uint)span.Length)
{
-   _span[index] = 42;
+   span[index] = 42;
}

When the index is not constant (here 3), the same issue occurs.

Complete listing of code (C#)
using BenchmarkDotNet.Attributes;

BenchmarkDotNet.Running.BenchmarkRunner.Run<Bench>();

[DisassemblyDiagnoser]
[ShortRunJob]
public class Bench
{
    private readonly int[] _array = new int[100];

    [Benchmark]
    public void NoLocalSpan()
    {
        Foo foo = new(_array);
        foo.Set(3);
    }

    [Benchmark]
    public void LocalSpan()
    {
        Foo1 foo = new(_array);
        foo.Set(3);
    }
}

internal ref struct Foo
{
    private readonly Span<int> _span;

    internal Foo(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        if ((uint)index < (uint)_span.Length)
        {
            _span[index] = 42;
        }
    }
}

internal ref struct Foo1
{
    private readonly Span<int> _span;

    internal Foo1(Span<int> span) => _span = span;

    internal void Set(int index)
    {
        Span<int> span = _span;
        if ((uint)index < (uint)span.Length)
        {
            span[index] = 42;
        }
    }
}
Complete listing of generated code
; Bench.NoLocalSpan()
       sub       rsp,38
       xor       eax,eax
       mov       [rsp+28],rax
       mov       [rsp+30],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+28]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       cmp       dword ptr [rsp+30],3
       jbe       short M00_L01
       lea       rdx,[rsp+28]
       cmp       dword ptr [rdx+8],3
       jbe       short M00_L03
       mov       rax,[rdx]
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,38
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
M00_L03:
       call      CORINFO_HELP_RNGCHKFAIL
       int       3
; Total bytes of code 88

; Bench.LocalSpan()
       sub       rsp,18
       xor       eax,eax
       mov       [rsp+8],rax
       mov       [rsp+10],rax
       mov       rax,[rcx+8]
       test      rax,rax
       je        short M00_L02
       lea       rdx,[rax+10]
       mov       eax,[rax+8]
M00_L00:
       lea       rcx,[rsp+8]
       mov       [rcx],rdx
       mov       [rcx+8],eax
       lea       rdx,[rsp+8]
       mov       rax,[rdx]
       mov       edx,[rdx+8]
       cmp       edx,3
       jbe       short M00_L01
       mov       dword ptr [rax+0C],2A
M00_L01:
       add       rsp,18
       ret
M00_L02:
       xor       edx,edx
       xor       eax,eax
       jmp       short M00_L00
; Total bytes of code 77

Cf. #67448 (comment)

Author: gfoidl
Assignees: -
Labels:

area-CodeGen-coreclr, untriaged

Milestone: -

@EgorBo
Copy link
Member

EgorBo commented Jul 12, 2022

I don't see any bound checks in Main:
image

@gfoidl
Copy link
Member Author

gfoidl commented Jul 12, 2022

Hm, interesting. Tried it with https://github.com/dotnet/installer version 7.0.100-preview7.22362.1 (win-x64, current main as of now), and there is a bound-check.
Was there a recent change in JIT? How does the codegen look when this method is called -- it's small enough to be inlined, then I saw the bound-check. Didn't look in isolation to that method.

@EgorBo
Copy link
Member

EgorBo commented Jul 12, 2022

Hm, interesting. Tried it with https://github.com/dotnet/installer version 7.0.100-preview7.22362.1 (win-x64, current main as of now), and there is a bound-check. Was there a recent change in JIT? How does the codegen look when this method is called -- it's small enough to be inlined, then I saw the bound-check. Didn't look in isolation to that method.

image

@EgorBo
Copy link
Member

EgorBo commented Jul 12, 2022

hm.. actually it might be PGO... 🤔

@EgorBo
Copy link
Member

EgorBo commented Jul 12, 2022

So, in some cases JIT doesn't eliminate bound checks inside cold blocks - it's pretty normal behavior to me. So an example where it happens in non-cold block would be nice to have.

Example:

internal ref struct Foo
{
    private readonly Span<int> _span;

    internal Foo(Span<int> span) => _span = span;

    [MethodImpl(MethodImplOptions.NoInlining)]
    internal void Set(int index)
    {
        if ((uint)index < (uint)_span.Length)
            _span[index] = 42;
    }
}

public class Program
{
    static void Main()
    {
        Foo f = new Foo(new [] {1,2,3,4});
        for (int i = 0; i < 100; i++)
        {
            f.Set(1);
            Thread.Sleep(16);
        }
    }
}

Here is the codegen for Set in Tier1 with PGO enabled:

; Assembly listing for method Foo:Set(int):this
       8B4108               mov      eax, dword ptr [rcx+8]
       3BD0                 cmp      edx, eax
       730C                 jae      SHORT G_M45610_IG03
       488B01               mov      rax, bword ptr [rcx]
       8BD2                 mov      edx, edx
       C704902A000000       mov      dword ptr [rax+4*rdx], 42
						;; size=19 bbWeight=1    PerfScore 6.50
G_M45610_IG03:              ;; offset=0013H
       C3                   ret

Now let me change the call from f.Set(1) to f.Set(100) so we will never step into the condition inside Set, the codegen will be:

; Assembly listing for method Foo:Set(int):this
       4883EC28             sub      rsp, 40
						;; size=4 bbWeight=1    PerfScore 0.25
G_M45610_IG02:              ;; offset=0004H
       3B5108               cmp      edx, dword ptr [rcx+8]
       7205                 jb       SHORT G_M45610_IG04
						;; size=5 bbWeight=1    PerfScore 4.00
G_M45610_IG03:              ;; offset=0009H
       4883C428             add      rsp, 40
       C3                   ret      
						;; size=5 bbWeight=1    PerfScore 1.25
G_M45610_IG04:              ;; offset=000EH
       3B5108               cmp      edx, dword ptr [rcx+8]
       730E                 jae      SHORT G_M45610_IG05
       488B01               mov      rax, bword ptr [rcx]
       8BD2                 mov      edx, edx
       C704902A000000       mov      dword ptr [rax+4*rdx], 42
       EBE8                 jmp      SHORT G_M45610_IG03
						;; size=19 bbWeight=0    PerfScore 0.00
G_M45610_IG05:              ;; offset=0021H
       E87AAC4B5F           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3

A stale/generic PGO might bake profile that doesn't work well for currently tested app and can mess it up but presumably it's a rare case. cc @AndyAyersMS

@gfoidl
Copy link
Member Author

gfoidl commented Jul 12, 2022

JIT doesn't eliminate bound checks inside cold blocks - it's pretty normal behavior to me

This is what I'd except too. But when run via BenchmarkDotNet it should be hot enough to get "optimized" Tier-1 code.
Also "NoInlining" isn't like real-world usage here (I know you know this, of course).

May this be related to something in BDN and a preview version of .NET?

@EgorBo
Copy link
Member

EgorBo commented Jul 12, 2022

Ok, I think I can reproduce yeah

@EgorBo EgorBo added this to the 8.0.0 milestone Jul 12, 2022
@EgorBo EgorBo removed the untriaged New issue has not been triaged by the area owner label Jul 12, 2022
@EgorBo EgorBo modified the milestones: 8.0.0, Future Apr 24, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

No branches or pull requests

2 participants