-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add basic support for folding and normalizing hwintrinsic trees in morph #103143
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
1de5887
to
49088f6
Compare
49088f6
to
471aa9a
Compare
471aa9a
to
03d5fc1
Compare
938d968
to
ee0b8ff
Compare
7d71366
to
d2203a3
Compare
d2203a3
to
4a80f83
Compare
4a80f83
to
1cca48e
Compare
CC. @dotnet/jit-contrib, much as with the floating-point PR (#103206), this simply enables constant folding of hwintrinsic nodes in morph (making it more inline with how simple integer ops are handled) and expands the support to a few other common operations, such as |
fbe82b9
to
d057ab9
Compare
Updated to handle the same feedback as given in #103206 |
39961a4
to
16d6c64
Compare
16d6c64
to
1f24151
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes generally make sense to me, it's just unfortunate that we effectively duplicate constant folding logic here and in VN for the same ops resulting in+2000 LOC of code. Btw any idea why arm64 regresses on SPMI?
Definitely agree. It's something I noticed happens for Maybe we could really just setup some kind of templated API that works with either
It's mostly related to One example of this is There's a couple other places where code has "regressed" as well, but it's typically in edge cases like this that are handling paths we don't expect to execute on Arm64. The paths we actually care about (using @@ -18,35 +18,35 @@
; V07 tmp4 [V07,T07] ( 2, 4 ) long -> x0 "Inlining Arg"
; V08 tmp5 [V08,T03] ( 3, 6 ) long -> x1 "Inlining Arg"
;* V09 tmp6 [V09 ] ( 0, 0 ) struct (32) zero-ref HFA(simd16) ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector256`1[ulong]>
-;* V10 tmp7 [V10,T15] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ulong]>
-;* V11 tmp8 [V11,T16] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ulong]>
+;* V10 tmp7 [V10 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ulong]>
+;* V11 tmp8 [V11 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ulong]>
; V12 tmp9 [V12,T09] ( 3, 3 ) ref -> x21 class-hnd exact single-def "Inline stloc first use temp" <<unknown class>>
; V13 tmp10 [V13,T10] ( 3, 3 ) ref -> x2 class-hnd exact single-def "Inline stloc first use temp" <<unknown class>>
; V14 tmp11 [V14,T08] ( 2, 4 ) long -> x20 "Inlining Arg"
;* V15 tmp12 [V15 ] ( 0, 0 ) byref -> zero-ref "Inlining Arg"
-; V16 tmp13 [V16,T11] ( 3, 3 ) simd16 -> [fp+0x20] HFA(simd16) spill-single-def "field V01._lower (fldOffset=0x0)" P-INDEP
-; V17 tmp14 [V17,T12] ( 3, 3 ) simd16 -> [fp+0x10] HFA(simd16) spill-single-def "field V01._upper (fldOffset=0x10)" P-INDEP
+; V16 tmp13 [V16,T11] ( 3, 3 ) simd16 -> d8 HFA(simd16) "field V01._lower (fldOffset=0x0)" P-INDEP
+; V17 tmp14 [V17,T12] ( 3, 3 ) simd16 -> d9 HFA(simd16) "field V01._upper (fldOffset=0x10)" P-INDEP
;* V18 tmp15 [V18 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V02._lower (fldOffset=0x0)" P-INDEP
;* V19 tmp16 [V19 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V02._upper (fldOffset=0x10)" P-INDEP
-;* V20 tmp17 [V20,T17] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V09._lower (fldOffset=0x0)" P-INDEP
-;* V21 tmp18 [V21,T18] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V09._upper (fldOffset=0x10)" P-INDEP
+;* V20 tmp17 [V20,T13] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V09._lower (fldOffset=0x0)" P-INDEP
+;* V21 tmp18 [V21,T14] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "field V09._upper (fldOffset=0x10)" P-INDEP
; V22 tmp19 [V22,T04] ( 3, 6 ) long -> x0 "BlockOp address local"
; V23 tmp20 [V23,T05] ( 3, 6 ) long -> x0 "BlockOp address local"
; V24 tmp21 [V24,T02] ( 3, 6 ) byref -> x0 single-def "BlockOp address local"
-; V25 cse0 [V25,T13] ( 2, 2 ) simd16 -> d17 "CSE #02: aggressive"
-; V26 cse1 [V26,T06] ( 4, 4 ) byref -> x20 "CSE #03: aggressive"
-; V27 cse2 [V27,T14] ( 2, 2 ) simd16 -> d16 "CSE #01: moderate"
+; V25 cse0 [V25,T06] ( 4, 4 ) byref -> x20 "CSE #01: aggressive"
;
-; Lcl frame size = 40
+; Lcl frame size = 8
G_M64353_IG01: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
stp fp, lr, [sp, #-0x50]!
+ stp d8, d9, [sp, #0x18]
+ stp d10, d11, [sp, #0x28]
stp x19, x20, [sp, #0x38]
str x21, [sp, #0x48]
mov fp, sp
mov x19, x0
; gcrRegs +[x19]
- ;; size=20 bbWeight=1 PerfScore 4.00
+ ;; size=28 bbWeight=1 PerfScore 6.00
G_M64353_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, byref
movz x0, #0xD1FFAB1E
movk x0, #0xD1FFAB1E LSL #16
@@ -84,26 +84,29 @@ G_M64353_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
sub x0, x0, #1
sub x1, x1, #1
bic x0, x0, x1
- ldp q16, q17, [x0]
- stp q17, q16, [fp, #0x10] // [V17 tmp14], [V16 tmp13]
+ ldp q8, q9, [x0]
mov x0, x20
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
+ mov v10.d[0], v8.d[1]
+ mov v11.d[0], v9.d[1]
blr x1
; byrRegs -[x0 x21]
- ldr q16, [fp, #0x20] // [V16 tmp13]
- str q16, [x0]
- ldr q17, [fp, #0x10] // [V17 tmp14]
- str q17, [x0, #0x10]
+ mov v8.d[1], v10.d[0]
+ str q8, [x0]
+ mov v9.d[1], v11.d[0]
+ str q9, [x0, #0x10]
mov x0, x20
; byrRegs +[x0]
movz x1, #0xD1FFAB1E // code for <unknown method>
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
+ mov v10.d[0], v8.d[1]
+ mov v11.d[0], v9.d[1]
blr x1
; byrRegs -[x0 x20]
mov x20, x0
@@ -127,10 +130,10 @@ G_M64353_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
add x0, x21, #16
; gcrRegs -[x0]
; byrRegs +[x0]
- ldr q16, [fp, #0x20] // [V16 tmp13]
- str q16, [x0]
- ldr q17, [fp, #0x10] // [V17 tmp14]
- str q17, [x0, #0x10]
+ mov v8.d[1], v10.d[0]
+ str q8, [x0]
+ mov v9.d[1], v11.d[0]
+ str q9, [x0, #0x10]
ldp q16, q17, [x20]
stp q16, q17, [x2, #0x10]
mov x0, x19
@@ -147,26 +150,28 @@ G_M64353_IG02: ; bbWeight=1, gcrefRegs=80000 {x19}, byrefRegs=0000 {}, by
ldr x4, [x4]
blr x4
; gcrRegs -[x0-x2 x19 x21]
- ;; size=312 bbWeight=1 PerfScore 77.00
+ ;; size=324 bbWeight=1 PerfScore 76.00
G_M64353_IG03: ; bbWeight=1, epilog, nogc, extend
ldr x21, [sp, #0x48]
ldp x19, x20, [sp, #0x38]
+ ldp d10, d11, [sp, #0x28]
+ ldp d8, d9, [sp, #0x18]
ldp fp, lr, [sp], #0x50
ret lr
- ;; size=16 bbWeight=1 PerfScore 5.00
+ ;; size=24 bbWeight=1 PerfScore 7.00
-; Total bytes of code 348, prolog size 16, PerfScore 86.00, instruction count 87, allocated bytes for code 348 (MethodHash=fd89049e) for method JIT.HardwareIntrinsics.General._Vector256_1.VectorImmBinaryOpTest__op_RightShiftUInt6464:RunLclVarScenario_UnsafeRead():this (FullOpts)
+; Total bytes of code 376, prolog size 24, PerfScore 89.00, instruction count 94, allocated bytes for code 376 (MethodHash=fd89049e) for method JIT.HardwareIntrinsics.General._Vector256_1.VectorImmBinaryOpTest__op_RightShiftUInt6464:RunLclVarScenario_UnsafeRead():this (FullOpts)
; ============================================================
Unwind Info:
>> Start offset : 0x000000 (not in unwind data)
>> End offset : 0xd1ffab1e (not in unwind data)
- Code Words : 2
+ Code Words : 3
Epilog Count : 1
E bit : 0
X bit : 0
Vers : 0
- Function Length : 87 (0x00057) Actual length = 348 (0x00015c)
+ Function Length : 94 (0x0005e) Actual length = 376 (0x000178)
---- Epilog scopes ----
---- Scope 0
Epilog Start Offset : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
@@ -176,7 +181,10 @@ Unwind Info:
---- Epilog start at index 1 ----
D0 89 save_reg X#2 Z#9 (0x09); str x21, [sp, #72]
C8 07 save_regp X#0 Z#7 (0x07); stp x19, x20, [sp, #56]
+ E6 save_next
+ D8 03 save_fregp X#0 Z#3 (0x03); stp d8, d9, [sp, #24]
89 save_fplr_x #9 (0x09); stp fp, lr, [sp, #-80]!
E4 end
E4 end
+ E4 end |
Potential regressions: |
The regression are more likely #103462 and caused by the use of |
This covers some, but not all, of the same optimizations that exist for regular GenTreeOp nodes.