-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extend redundant zero init optimization to recognize assignments to GT_OBJ(lcl_addr)
#38314
Conversation
Diff for the example from #38070: ; Assembly listing for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 RetBuf [V01,T00] ( 4, 4 ) byref -> rsi
-; V02 loc0 [V02 ] ( 4, 4 ) struct (48) [rsp+0x20] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
+; V02 loc0 [V02 ] ( 3, 3 ) struct (48) [rsp+0x20] do-not-enreg[XSFB] must-init addr-exposed ld-addr-op
; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) struct (40) zero-ref do-not-enreg[SB] ld-addr-op "Inline ldloca(s) first use temp"
;
; Lcl frame size = 80
G_M9604_IG01:
push rsi
sub rsp, 80
- vzeroupper
vxorps xmm4, xmm4
vmovdqa xmmword ptr [rsp+20H], xmm4
vmovdqa xmmword ptr [rsp+30H], xmm4
vmovdqa xmmword ptr [rsp+40H], xmm4
mov rsi, rdx
- ;; bbWeight=1 PerfScore 5.83
+ ;; bbWeight=1 PerfScore 4.83
G_M9604_IG02:
- xor ecx, ecx
- vxorps xmm0, xmm0
- vmovdqu xmmword ptr [rsp+28H], xmm0
- vmovdqu xmmword ptr [rsp+38H], xmm0
- mov qword ptr [rsp+48H], rcx
mov dword ptr [rsp+20H], -1
lea rcx, [rsp+20H]
call System.Runtime.CompilerServices.AsyncMethodBuilderCore:Start(byref)
lea rcx, bword ptr [rsp+28H]
mov rdx, rsi
call System.Runtime.CompilerServices.AsyncValueTaskMethodBuilder`1[ReadResult][DoubleZero+ReadResult]:get_Task():System.Threading.Tasks.ValueTask`1[ReadResult]:this
mov rax, rsi
- ;; bbWeight=1 PerfScore 8.08
+ ;; bbWeight=1 PerfScore 4.50
G_M9604_IG03:
add rsp, 80
pop rsi
ret
;; bbWeight=1 PerfScore 1.75
-; Total bytes of code 96, prolog size 30, PerfScore 25.97, (MethodHash=85efda7b) for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
+; Total bytes of code 70, prolog size 27, PerfScore 18.48, (MethodHash=85efda7b) for method DoubleZero:SingleMethod():System.Threading.Tasks.ValueTask`1[ReadResult]:this
; ============================================================
|
@AndyAyersMS PTAL /cc @dotnet/jit-contrib |
Could also be that PMI is not instantiating enough interesting value classes to show broad applicability here -- you might experiment with adding a value class with a GC ref field to the "types to try" array. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Left a couple of suggestions.
src/coreclr/src/jit/optimizer.cpp
Outdated
{ | ||
lclNum = treeOp->gtOp1->AsLclVarCommon()->GetLclNum(); | ||
} | ||
else if (treeOp->gtOp1->OperIs(GT_OBJ)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we check for GT_BLK
too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good suggestion, thanks.
…GT_OBJ(lcl_addr)` and `GT_BLK(lcl_addr)` Fixes dotnet#38070.
6ca7b19
to
9d35b6d
Compare
There are a few diffs after adding x64 framework pmi:
|
One of the framework diffs: ; Assembly listing for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
G_M4265_IG01:
push rsi
sub rsp, 80
vzeroupper
vxorps xmm4, xmm4
vmovdqa xmmword ptr [rsp+30H], xmm4
vmovdqa xmmword ptr [rsp+40H], xmm4
mov rsi, r8
;; bbWeight=1 PerfScore 4.83
G_M4265_IG02:
mov r9, bword ptr [rcx]
mov r8d, dword ptr [rcx+8]
- xor eax, eax
- mov qword ptr [rsp+30H], rax
- mov byte ptr [rsp+38H], 0
lea rax, bword ptr [rsp+40H]
mov bword ptr [rax], r9
mov dword ptr [rax+8], r8d
G_M4265_IG03:
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmmword ptr [rsp+20H], xmm0
G_M4265_IG04:
lea rcx, bword ptr [rsp+20H]
lea r9, [rsp+30H]
mov r8, rdx
mov edx, 3
call TimeSpanParse:TryParseTimeSpan(ReadOnlySpan`1,ubyte,IFormatProvider,byref):bool
test eax, eax
je SHORT G_M4265_IG07
G_M4265_IG05:
mov rax, qword ptr [rsp+30H]
mov qword ptr [rsi], rax
mov eax, 1
G_M4265_IG06:
add rsp, 80
pop rsi
ret
G_M4265_IG07:
xor eax, eax
mov qword ptr [rsi], rax
G_M4265_IG08:
add rsp, 80
pop rsi
ret
-; Total bytes of code 125, prolog size 24, PerfScore 36.83, (MethodHash=8014ef56) for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
+; Total bytes of code 113, prolog size 24, PerfScore 33.38, (MethodHash=8014ef56) for method TimeSpanParse:TryParse(ReadOnlySpan`1,IFormatProvider,byref):bool
|
Not sure why that leg timed out, but all of the jobs completed correctly. So you can go ahead and merge despite that failure. |
Thank you! 😍 |
Fixes #38070.
No diffs in frameworks and benchmarks. Diffs are expected once we start using Roslyn that includes @benadams's change dotnet/roslyn#45262