-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitStress] VectorTableLookupExtension fails in jitstress1 #84696
Comments
CC. @kunalspathak |
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak Issue DetailsFor:
We see
and
|
Thanks, I will take a look. Most likely this is happening because delay reg frees are not getting tracked properly. |
Looking at the result and inferring, it seems that the upper half of the input operand's vector register is zeroed out. With that only indices between |
Pasting the assembly here. Assembly; Assembly listing for method JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte:RunBasicScenario_UnsafeRead():this
; Emitting BLENDED_CODE for generic ARM64 CPU - Windows
; optimized code
; fp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 14 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 this [V00,T01] ( 5, 5 ) ref -> x19 this class-hnd single-def
; V01 loc0 [V01,T22] ( 2, 2 ) simd16 -> d8 HFA(simd16)
; V02 tmp0 [V02,T16] ( 1, 1 ) int -> [fp+14H] do-not-enreg[V] "GSCookie dummy"
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [sp+00H] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp2 [V04,T17] ( 2, 4 ) simd16 -> d8 HFA(simd16) "impAppendStmt"
;* V05 tmp3 [V05,T18] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) ptr "impAppendStmt"
;* V06 tmp4 [V06,T03] ( 0, 0 ) struct (32) zero-ref HFA(simd16) do-not-enreg[SF] ld-addr-op ptr "NewObj constructor temp"
;* V07 tmp5 [V07,T19] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) ptr "struct address for call/obj"
;* V08 tmp6 [V08 ] ( 0, 0 ) struct (32) zero-ref HFA(simd16) do-not-enreg[SF] ptr "impAppendStmt"
;* V09 tmp7 [V09 ] ( 0, 0 ) simd16 -> zero-ref HFA(simd16) "struct address for call/obj"
; V10 tmp8 [V10,T07] ( 2, 4 ) long -> x22 "impAppendStmt"
;* V11 tmp9 [V11 ] ( 0, 0 ) long -> zero-ref "impAppendStmt"
;* V12 tmp10 [V12 ] ( 0, 0 ) long -> zero-ref "impAppendStmt"
;* V13 tmp11 [V13 ] ( 0, 0 ) long -> zero-ref "impAppendStmt"
;* V14 tmp12 [V14 ] ( 0, 0 ) ref -> zero-ref class-hnd "Inlining Arg"
;* V15 tmp13 [V15 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
; V16 tmp14 [V16,T02] ( 3, 6 ) byref -> x22 single-def "Inlining Arg"
;* V17 tmp15 [V17 ] ( 0, 0 ) long -> zero-ref ld-addr-op "Inline stloc first use temp"
; V18 tmp16 [V18,T08] ( 2, 4 ) long -> x0 "Inlining Arg"
; V19 tmp17 [V19,T04] ( 3, 6 ) long -> x1 "Inlining Arg"
;* V20 tmp18 [V20 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
; V21 tmp19 [V21,T09] ( 2, 4 ) long -> x0 "Inlining Arg"
;* V22 tmp20 [V22 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V23 tmp21 [V23 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
; V24 tmp22 [V24,T06] ( 2, 4 ) byref -> x24 single-def "Inlining Arg"
;* V25 tmp23 [V25 ] ( 0, 0 ) long -> zero-ref ld-addr-op "Inline stloc first use temp"
; V26 tmp24 [V26,T10] ( 2, 4 ) long -> x23 "Inlining Arg"
; V27 tmp25 [V27,T05] ( 3, 6 ) long -> x24 "Inlining Arg"
; V28 tmp26 [V28,T11] ( 2, 4 ) long -> x25 "argument with side effect"
; V29 tmp27 [V29,T12] ( 2, 4 ) long -> x26 "argument with side effect"
; V30 tmp28 [V30,T13] ( 2, 4 ) long -> x5 "argument with side effect"
; V31 GsCookie [V31 ] ( 1, 1 ) long -> [fp+18H] do-not-enreg[X] addr-exposed "GSSecurityCookie"
; V32 cse0 [V32,T20] ( 2, 2 ) simd16 -> d10 "CSE - stress mode"
; V33 cse1 [V33,T00] ( 11, 11 ) byref -> x21 "CSE - aggressive"
; V34 cse2 [V34,T21] ( 2, 2 ) simd16 -> d12 "CSE - moderate"
; V35 cse3 [V35,T14] ( 3, 3 ) byref -> x23 "CSE - moderate"
; V36 cse4 [V36,T15] ( 3, 3 ) ref -> x20 "CSE - moderate"
;
; Lcl frame size = 16
G_M52190_IG01:
stp fp, lr, [sp, #-0x90]!
stp d8, d9, [sp, #0x20]
stp d10, d11, [sp, #0x30]
stp d12, d13, [sp, #0x40]
stp x19, x20, [sp, #0x50]
stp x21, x22, [sp, #0x60]
stp x23, x24, [sp, #0x70]
stp x25, x26, [sp, #0x80]
mov fp, sp
movz x1, #0xD1FFAB1E
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
movk x1, #0xD1FFAB1E LSL #48
str x1, [fp, #0x18]
mov x19, x0
;; size=60 bbWeight=1 PerfScore 12.00
G_M52190_IG02:
movz x0, #0xD1FFAB1E
movk x0, #0xD1FFAB1E LSL #16
movk x0, #0xD1FFAB1E LSL #32
movz x20, #0xD1FFAB1E
movk x20, #0xD1FFAB1E LSL #16
movk x20, #0xD1FFAB1E LSL #32
mov x1, x20
movz x2, #0xD1FFAB1E // code for System.String:Concat(System.String,System.String):System.String
movk x2, #0xD1FFAB1E LSL #16
movk x2, #0xD1FFAB1E LSL #32
ldr x2, [x2]
blr x2
movz x1, #0xD1FFAB1E // code for System.Console:WriteLine(System.String)
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
ldrsb wzr, [x19]
add x21, x19, #88
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray0Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
ldr q8, [x0]
mov x22, x21
add x23, x22, #56
mov x0, x23
movz x1, #0xD1FFAB1E // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
mov v9.d[0], v8.d[1]
blr x1
ldr x1, [x22, #0x28]
add x0, x0, x1
sub x0, x0, #1
sub x1, x1, #1
bic x0, x0, x1
ldr q10, [x0]
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray2Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
mov v11.d[0], v10.d[1]
blr x1
ldr q12, [x0]
mov v10.d[1], v11.d[0]
mov v12.d[1], v13.d[0]
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray3Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
mov v13.d[0], v12.d[1]
blr x1
ldr q16, [x0]
mov v8.d[1], v9.d[0]
mov v17.16b, v10.16b
mov v18.16b, v12.16b
tbx v8.16b, {v17.16b, v18.16b}, v16.16b
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_outArrayPtr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
mov v10.d[0], v8.d[1]
blr x1
mov v8.d[1], v10.d[0]
str q8, [x0]
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray0Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
mov x22, x0
mov x24, x21
mov x0, x23
movz x1, #0xD1FFAB1E // code for System.Runtime.InteropServices.GCHandle:AddrOfPinnedObject():long:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
mov x23, x0
ldr x24, [x24, #0x28]
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray2Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
mov x25, x0
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_inArray3Ptr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
mov x26, x0
mov x0, x21
movz x1, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte+DataTable:get_outArrayPtr():ulong:this
movk x1, #0xD1FFAB1E LSL #16
movk x1, #0xD1FFAB1E LSL #32
ldr x1, [x1]
blr x1
mov x5, x0
mov x3, x25
mov x4, x26
sub x2, x24, #1
add x0, x24, x23
sub x0, x0, #1
bic x2, x0, x2
mov x0, x19
mov x1, x22
mov x6, x20
movz x7, #0xD1FFAB1E // code for JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte:ValidateResult(ulong,ulong,ulong,ulong,ulong,System.String):this
movk x7, #0xD1FFAB1E LSL #16
movk x7, #0xD1FFAB1E LSL #32
ldr x7, [x7]
blr x7
movz xip0, #0xD1FFAB1E
movk xip0, #0xD1FFAB1E LSL #16
movk xip0, #0xD1FFAB1E LSL #32
movk xip0, #0xD1FFAB1E LSL #48
ldr xip1, [fp, #0x18]
cmp xip0, xip1
beq G_M52190_IG03
bl CORINFO_HELP_FAIL_FAST
;; size=524 bbWeight=1 PerfScore 130.50
G_M52190_IG03:
ldp x25, x26, [sp, #0x80]
ldp x23, x24, [sp, #0x70]
ldp x21, x22, [sp, #0x60]
ldp x19, x20, [sp, #0x50]
ldp d12, d13, [sp, #0x40]
ldp d10, d11, [sp, #0x30]
ldp d8, d9, [sp, #0x20]
ldp fp, lr, [sp], #0x90
ret lr
;; size=36 bbWeight=1 PerfScore 9.00
; Total bytes of code 620, prolog size 60, PerfScore 213.50, instruction count 155, allocated bytes for code 620 (MethodHash=bbf93421) for method JIT.HardwareIntrinsics.Arm._AdvSimd.Arm64.VectorLookupExtension_2Test__VectorTableLookupExtensionByte:RunBasicScenario_UnsafeRead():this
; ============================================================
; defaultValues: (190, 124, 118, 226, 138, 42, 111, 167, 253, 210, 20, 0, 49, 124, 130, 225)
; firstOp: (192, 163, 112, 25, 117, 254, 2, 192, 78, 94, 137, 159, 14, 248, 198, 150)
; secondOp: (198, 133, 186, 70, 250, 91, 16, 238, 194, 225, 234, 20, 87, 197, 150, 48)
; indices: (27, 34, 19, 12, 13, 13, 24, 22, 30, 28, 32, 32, 31, 36, 12, 1)
; result: (0, 124, 70, 0, 0, 0, 0, 16, 0, 0, 20, 0, 0, 124, 0, 163) |
This seems to be an odd existing problem that was surfaced with the consecutive work because this is the first time that we started using For
When we build
Note that, Refposition This leads to multiple problems, some of them can be seen in this example:
Ideally, we need to carry some semantics to tell that, in such situations, the use should be treated as a use of the field_list rather than of its individual fields. I will continue investigating whether that would help and is feasible. |
I have discovered another problem, related to ordering, that seems to be pre-existing for upper-vector restores and copy registers. The arrangement of refpositions for consecutive registers exposed it. This will be a long post, but it is worth describing the problem in detail. Typically, if there is a use of a partially-saved vector (a vector whose upper half was saved before the call), we first restore it and then use it. There arises an interesting scenario in presence of restore upper-half of V07 from r15 into r5
use V07 present in r5
restore upper-half of V08 from r8 into r6
use V08 present in r6
restore upper-half of V11 from r11 into r7
use V11 present in r7 The way we insert the restores is by appending them one after another before the use tree node, in our case, restore upper-half of V07 from r15 into r5
restore upper-half of V08 from r8 into r6
restore upper-half of V11 from r11 into r7
use V07 present in r5
use V08 present in r6
use V11 present in r7 Having such arrangement can create problems, but they can get worse and hit easily if we had to add For above example, imagine for restore upper-half of V07 from r12 into r15
restore upper-half of V08 from r8 into r18
restore upper-half of V11 from r11 into r14
copy V07 from r15 to r5
use V07 present in r5
copy V08 from r18 to r6
use V08 present in r6
copy V11 from r14 to r7
use V11 present in r7 Now let's see an example when a problem can arise because of this. Consider the following order in which registers are allocated for set of refpositions. [r9] : Register from where V07 restore happen into home location r8
[r13] : Use V07, but since it was present in r8, generate `copy r8 to r13`
[r11] : Register from where V08 restore happen into home location r10
[r14] : Use V08, but since it was present in r10, generate `copy r10 to r14`
; Note - Below, we can assign r8 because it is dead after copyReg for V07
[r8] : Register from where V11 restore happen into home location r12
[r15] : Use V11, but since it was present in r12, generate `copy r12 to r15` Below is the pseudo-code of how the code should ideally look. restore upper-half of V07 from r9 into r8
copy V07 from r8 to r13
use V07 present in r13 ; consecutive 1
restore upper-half of V08 from r11 into r10
copy V08 from r10 to r14
use V08 present in r14 ; consecutive 2
; load value from stack into r8
restore upper-half of V11 from r8 into r12
copy V11 from r12 to r15
use V11 present in r15 ; consecutive 3 However, since we insert restores one after another, we end up with this: restore upper-half of V07 from r9 into r8
restore upper-half of V08 from r11 into r10
; load value from stack into r8
restore upper-half of V11 from r8 into r12
copy V07 from r8 to r13 ; <--- PROBLEM. r8 held the value of V11 but we thought we were just copying the value of V07
use V07 present in r13 ; consecutive 1
copy V08 from r10 to r14
use V08 present in r14 ; consecutive 2
copy V11 from r12 to r15
use V11 present in r15 ; consecutive 3 I will check if I can tweak the algorithm to assign the same consecutive register to upper-half positions too, to avoid extra copies. I already added a TODO when working on consecutive registers: runtime/src/coreclr/jit/lsraarm64.cpp Lines 93 to 96 in 72586a7
Another possibility (at least for consecutive-register scenarios) is to mark the registers into which we are restoring the values, so that they will not be used while we are processing the refpositions. In the above example, we would basically mark r8 and r10 busy, and hence r8 won't be selected as the "restore from" register of V11. Another option is to rearrange the restores so that they happen right before the actual use instead of being placed next to each other. This seems a viable option, especially for consecutive registers. |
A snapshot from the Jitdump for one of the examples that has the problem I described above:
|
Fixed by #84824 |
For:
We see
and
The text was updated successfully, but these errors were encountered: