JIT ARM64: Add IF_SVE_DT_3A, IF_SVE_DU_3A, IF_SVE_DX_3A, IF_SVE_DY_3A #96201

amanasifkhalid · 2023-12-19T21:13:46Z

Part of #94549. cc @dotnet/arm64-contrib, @a74nh.

JitDisasm output:

whilege p0.b, w0, w1
whilege p1.b, x2, x3
whilegt p2.b, w4, w5
whilegt p3.b, x6, x7
whilehi p4.h, w8, w9
whilehi p5.h, x10, x11
whilehs p6.h, w12, w13
whilehs p7.h, x14, x15
whilele p8.s, w0, w1
whilele p9.s, x2, x3
whilelo p10.s, w4, w5
whilelo p11.s, x6, x7
whilels p12.d, w8, w9
whilels p13.d, x10, x11
whilelt p14.d, w12, w13
whilelt p15.d, x14, x15
whilerw p0.b, x0, x1
whilerw p1.h, x2, x3
whilerw p2.s, x4, x5
whilerw p3.d, x6, x7
whilewr p4.b, x8, x9
whilewr p5.h, x10, x11
whilewr p6.s, x12, x13
whilewr p7.d, x14, x15
whilege { p0.b, p1.b }, x0, x1
whilegt { p2.b, p3.b }, x2, x3
whilehi { p4.h, p5.h }, x4, x5
whilehs { p6.h, p7.h }, x6, x7
whilele { p8.s, p9.s }, x8, x9
whilelo { p10.s, p11.s }, x10, x11
whilels { p12.d, p13.d }, x12, x13
whilelt { p14.d, p15.d }, x14, x15
whilege p8.b, x0, x1, vlx2
whilege p9.b, x2, x3, vlx4
whilegt p10.h, x4, x5, vlx2
whilegt p11.h, x6, x7, vlx4
whilehi p12.s, x8, x9, vlx2
whilehi p13.s, x10, x11, vlx4
whilehs p14.d, x12, x13, vlx2
whilehs p15.d, x14, x15, vlx4
whilele p8.b, x0, x1, vlx2
whilele p9.b, x2, x3, vlx4
whilelo p10.h, x4, x5, vlx2
whilelo p11.h, x6, x7, vlx4
whilels p12.d, x8, x9, vlx2
whilels p13.d, x10, x11, vlx4
whilelt p14.b, x12, x13, vlx2
whilelt p15.b, x14, x15, vlx4

cstool output:

00002125  whilege       p0.b, w0, w1
41102325  whilege       p1.b, x2, x3
92002525  whilegt       p2.b, w4, w5
D3102725  whilegt       p3.b, x6, x7
14096925  whilehi       p4.h, w8, w9
55196B25  whilehi       p5.h, x10, x11
86096D25  whilehs       p6.h, w12, w13
C7196F25  whilehs       p7.h, x14, x15
1804A125  whilele       p8.s, w0, w1
5914A325  whilele       p9.s, x2, x3
8A0CA525  whilelo       p10.s, w4, w5
CB1CA725  whilelo       p11.s, x6, x7
1C0DE925  whilels       p12.d, w8, w9
5D1DEB25  whilels       p13.d, x10, x11
8E05ED25  whilelt       p14.d, w12, w13
CF15EF25  whilelt       p15.d, x14, x15
10302125  whilerw       p0.b, x0, x1
51306325  whilerw       p1.h, x2, x3
9230A525  whilerw       p2.s, x4, x5
D330E725  whilerw       p3.d, x6, x7
04312925  whilewr       p4.b, x8, x9
45316B25  whilewr       p5.h, x10, x11
8631AD25  whilewr       p6.s, x12, x13
C731EF25  whilewr       p7.d, x14, x15
10502125  whilege       { p0.b, p1.b }, x0, x1
53502325  whilegt       { p2.b, p3.b }, x2, x3
95586525  whilehi       { p4.h, p5.h }, x4, x5
D6586725  whilehs       { p6.h, p7.h }, x6, x7
1955A925  whilele       { p8.s, p9.s }, x8, x9
5A5DAB25  whilelo       { p10.s, p11.s }, x10, x11
9D5DED25  whilels       { p12.d, p13.d }, x12, x13
DE55EF25  whilelt       { p14.d, p15.d }, x14, x15
18402125  whilegt       pn8.b, x0, x1, vlx2
59602325  whilegt       pn9.b, x2, x3, vlx4
9A406525  whilegt       pn0xA.h, x4, x5, vlx2
DB606725  whilegt       pn0xB.h, x6, x7, vlx4
1C49A925  whilehi       pn0xC.s, x8, x9, vlx2
5D69AB25  whilehi       pn0xD.s, x10, x11, vlx4
9E49ED25  whilehi       pn0xE.d, x12, x13, vlx2
DF69EF25  whilehi       pn0xF.d, x14, x15, vlx4
18442125  whilele       pn8.b, x0, x1, vlx2
59642325  whilele       pn9.b, x2, x3, vlx4
9A4C6525  whilels       pn0xA.h, x4, x5, vlx2
DB6C6725  whilels       pn0xB.h, x6, x7, vlx4
1C4DE925  whilels       pn0xC.d, x8, x9, vlx2
5D6DEB25  whilels       pn0xD.d, x10, x11, vlx4
9E452D25  whilele       pn0xE.b, x12, x13, vlx2
DF652F25  whilele       pn0xF.b, x14, x15, vlx4

Notice that for the last few instructions with vector length specifiers, cstool prints the predicate number slightly differently from how we currently do it in the JIT (e.g. pn0xA.h vs p10.h). Should we match cstool's formatting exactly, or is this unimportant in this case?

ghost · 2023-12-19T21:14:02Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Part of #94549. cc @dotnet/arm64-contrib, @a74nh.

JitDisasm output:

whilege p0.b, w0, w1
whilege p1.b, x2, x3
whilegt p2.b, w4, w5
whilegt p3.b, x6, x7
whilehi p4.h, w8, w9
whilehi p5.h, x10, x11
whilehs p6.h, w12, w13
whilehs p7.h, x14, x15
whilele p8.s, w0, w1
whilele p9.s, x2, x3
whilelo p10.s, w4, w5
whilelo p11.s, x6, x7
whilels p12.d, w8, w9
whilels p13.d, x10, x11
whilelt p14.d, w12, w13
whilelt p15.d, x14, x15
whilerw p0.b, x0, x1
whilerw p1.h, x2, x3
whilerw p2.s, x4, x5
whilerw p3.d, x6, x7
whilewr p4.b, x8, x9
whilewr p5.h, x10, x11
whilewr p6.s, x12, x13
whilewr p7.d, x14, x15
whilege { p0.b, p1.b }, x0, x1
whilegt { p2.b, p3.b }, x2, x3
whilehi { p4.h, p5.h }, x4, x5
whilehs { p6.h, p7.h }, x6, x7
whilele { p8.s, p9.s }, x8, x9
whilelo { p10.s, p11.s }, x10, x11
whilels { p12.d, p13.d }, x12, x13
whilelt { p14.d, p15.d }, x14, x15
whilege p8.b, x0, x1, vlx2
whilege p9.b, x2, x3, vlx4
whilegt p10.h, x4, x5, vlx2
whilegt p11.h, x6, x7, vlx4
whilehi p12.s, x8, x9, vlx2
whilehi p13.s, x10, x11, vlx4
whilehs p14.d, x12, x13, vlx2
whilehs p15.d, x14, x15, vlx4
whilele p8.b, x0, x1, vlx2
whilele p9.b, x2, x3, vlx4
whilelo p10.h, x4, x5, vlx2
whilelo p11.h, x6, x7, vlx4
whilels p12.d, x8, x9, vlx2
whilels p13.d, x10, x11, vlx4
whilelt p14.b, x12, x13, vlx2
whilelt p15.b, x14, x15, vlx4

cstool output:

00002125  whilege       p0.b, w0, w1
41102325  whilege       p1.b, x2, x3
92002525  whilegt       p2.b, w4, w5
D3102725  whilegt       p3.b, x6, x7
14096925  whilehi       p4.h, w8, w9
55196B25  whilehi       p5.h, x10, x11
86096D25  whilehs       p6.h, w12, w13
C7196F25  whilehs       p7.h, x14, x15
1804A125  whilele       p8.s, w0, w1
5914A325  whilele       p9.s, x2, x3
8A0CA525  whilelo       p10.s, w4, w5
CB1CA725  whilelo       p11.s, x6, x7
1C0DE925  whilels       p12.d, w8, w9
5D1DEB25  whilels       p13.d, x10, x11
8E05ED25  whilelt       p14.d, w12, w13
CF15EF25  whilelt       p15.d, x14, x15
10302125  whilerw       p0.b, x0, x1
51306325  whilerw       p1.h, x2, x3
9230A525  whilerw       p2.s, x4, x5
D330E725  whilerw       p3.d, x6, x7
04312925  whilewr       p4.b, x8, x9
45316B25  whilewr       p5.h, x10, x11
8631AD25  whilewr       p6.s, x12, x13
C731EF25  whilewr       p7.d, x14, x15
10502125  whilege       { p0.b, p1.b }, x0, x1
53502325  whilegt       { p2.b, p3.b }, x2, x3
95586525  whilehi       { p4.h, p5.h }, x4, x5
D6586725  whilehs       { p6.h, p7.h }, x6, x7
1955A925  whilele       { p8.s, p9.s }, x8, x9
5A5DAB25  whilelo       { p10.s, p11.s }, x10, x11
9D5DED25  whilels       { p12.d, p13.d }, x12, x13
DE55EF25  whilelt       { p14.d, p15.d }, x14, x15
18402125  whilegt       pn8.b, x0, x1, vlx2
59602325  whilegt       pn9.b, x2, x3, vlx4
9A406525  whilegt       pn0xA.h, x4, x5, vlx2
DB606725  whilegt       pn0xB.h, x6, x7, vlx4
1C49A925  whilehi       pn0xC.s, x8, x9, vlx2
5D69AB25  whilehi       pn0xD.s, x10, x11, vlx4
9E49ED25  whilehi       pn0xE.d, x12, x13, vlx2
DF69EF25  whilehi       pn0xF.d, x14, x15, vlx4
18442125  whilele       pn8.b, x0, x1, vlx2
59642325  whilele       pn9.b, x2, x3, vlx4
9A4C6525  whilels       pn0xA.h, x4, x5, vlx2
DB6C6725  whilels       pn0xB.h, x6, x7, vlx4
1C4DE925  whilels       pn0xC.d, x8, x9, vlx2
5D6DEB25  whilels       pn0xD.d, x10, x11, vlx4
9E452D25  whilele       pn0xE.b, x12, x13, vlx2
DF652F25  whilele       pn0xF.b, x14, x15, vlx4

Notice that for the last few instructions with vector length specifiers, cstool prints the predicate number slightly differently from how we currently do it in the JIT (e.g. pn0xA.h vs p10.h). Should we match cstool's formatting exactly, or is this unimportant in this case?

Author:	amanasifkhalid
Assignees:	-
Labels:	`area-CodeGen-coreclr`, `arch-arm64-sve`
Milestone:	-

kunalspathak · 2023-12-19T23:14:19Z

Notice that for the last few instructions with vector length specifiers, cstool prints the predicate number slightly differently from how we currently do it in the JIT (e.g. pn0xA.h vs p10.h). Should we match cstool's formatting exactly, or is this unimportant in this case?

I don't think cstool output is correct. They are printing the register name in hex. @TIHan something to fix.

kunalspathak · 2023-12-19T23:17:48Z

I am seeing some differences in the instruction generated. Perhaps cstool uses alias and we should probably do that too:
left = coreclr
right = cstool

amanasifkhalid · 2023-12-20T00:29:22Z

Perhaps cstool uses alias and we should probably do that too

I can update my changes to use the same aliases as cstool. I'm guessing we'd have to split some of the INST3 groups in instrsarm64sve.h up into separate groups to get the JIT to use different display names depending on the encoding, right? If we do it this way, then I might be able to remove some of the new insOpts I added in this change, as they are causing issues, too. The failing pipelines have to do with the insOpts enum having too many entries to fit in an unsigned type, so trying to set _idInsOpt to something like INS_OPTS_ALIGN (the "largest" insOpts value) results in a narrowing cast that changes _idInsOpt to INS_OPTS_UXTW, thus triggering asserts.

a74nh · 2023-12-20T09:59:13Z

Perhaps cstool uses alias and we should probably do that too

I can update my changes to use the same aliases as cstool. I'm guessing we'd have to split some of the INST3 groups in instrsarm64sve.h up into separate groups to get the JIT to use different display names depending on the encoding, right? If we do it this way, then I might be able to remove some of the new insOpts I added in this change, as they are causing issues, too. The failing pipelines have to do with the insOpts enum having too many entries to fit in an unsigned type, so trying to set _idInsOpt to something like INS_OPTS_ALIGN (the "largest" insOpts value) results in a narrowing cast that changes _idInsOpt to INS_OPTS_UXTW, thus triggering asserts.

Generally we should be using the preferred alias, as it's intended to be easier to read.

What we've been doing for other instructions with aliases is just to change ins within the emitIns_() functions.

For cpy -> mov it's an easy switch:

runtime/src/coreclr/jit/emitarm64.cpp

Line 8842 in 8bd23f0

ins = INS_sve_mov;

For cmpXX instructions, it was easier to add an extra switch at the end of the main switch:

runtime/src/coreclr/jit/emitarm64.cpp

Line 9987 in 8bd23f0

case INS_sve_cmple:

a74nh · 2023-12-20T10:30:21Z

However, having said that about aliases, I don't think that's the issue here.

The difference in the output is, for example, WHILEGE -> WHILEGT. Those two can't be aliases of each other. Plus there is no mention of aliases for these instructions in the manuals.

Looking at WHILEGE and WHILEGT, the only difference between the encoding of them is that bit 3 (eq) is set on WHILEGT:
https://docsmirror.github.io/A64/2023-06/whilege_pn_rr.html
https://docsmirror.github.io/A64/2023-06/whilegt_pn_rr.html

Which suggests something is going wrong in the coreclr encoding.

Looking at PNd, this is a 3-bit field which is used to represent the registers p8 to p15, i.e. 000 would be p8, 001 would be p9, etc.

insEncodeReg_P_2_to_0() is wrong. It should be something like:

/*static*/ emitter::code_t emitter::insEncodeReg_P_2_to_0(regNumber reg)
{
    // Returns the encoding for a predicate register placed in bits 2..0.
    // The field is only 3 bits wide, so it can name P8..P15 and stores the
    // register's offset from P8.
    assert(isPredicateRegister(reg));

    // Zero-based index of the register relative to P0.
    const emitter::code_t predIndex = static_cast<emitter::code_t>(reg) - static_cast<emitter::code_t>(REG_P0);

    // Anything outside P8..P15 would spill into neighbouring bits of the
    // instruction word.
    assert((predIndex >= 8) && (predIndex <= 15));

    // The field starts at bit 0, so no shift is required.
    return predIndex - 8;
}

Looking at the autogenerated code, ptrue is the only other instruction that requires insEncodeReg_P_2_to_0(), and that uses it in the same way. So it looks like it's safe to change.

amanasifkhalid · 2023-12-20T18:10:16Z

@a74nh thank you for the fix! Changing insEncodeReg_P_2_to_0 fixed the diffs locally.

I'm trying to determine the best way to differentiate these instruction formats in emitIns_R_R_R without introducing new insOpts, as we only give _idInsOpt 6 bits (so 64 possible values) on ARM64, and increasing this allotment doesn't seem trivial. Would we be okay with splitting up these instruction definitions in instrsarm64sve.h? For example, INS_sve_whilege becomes INS_sve_whilege1, INS_sve_whilege2, and INS_sve_whilege3 (or maybe something more descriptive than just a numeric suffix), so now we have different switch cases in emitIns_R_R_R to handle the three different formats. I think we'd still need to differentiate the vl specifier for IF_SVE_DY_3A by the insOpts passed to emitIns_R_R_R, but this will at least reduce the number of new insOpts needed.

Another option is to not split up the instructions, but to (mis-)use existing insOpts values to differentiate the formats, but this would probably be confusing to the reader. Maybe we could hide this usage behind helper methods (like getInsOptsScalableWithPredicatePair(insOpts opt), which takes in e.g. INS_OPTS_SCALABLE_B_WITH_SCALAR and returns INS_OPTS_SCALABLE_B_WITH_PREDICATE_MERGE) to improve readability?

a74nh · 2023-12-20T18:48:12Z

I'm trying to determine the best way to differentiate these instruction formats in emitIns_R_R_R without introducing new insOpts, as we only give _idInsOpt 6 bits (so 64 possible values) on ARM64, and increasing this allotment doesn't seem trivial.

I hit the same issue with a previous PR, but managed to reduce the insopts value for that one. I was a little concerned we'd hit it again.

Would we be okay with splitting up these instruction definitions in instrsarm64sve.h? For example, INS_sve_whilege becomes INS_sve_whilege1, INS_sve_whilege2, and INS_sve_whilege3 (or maybe something more descriptive than just a numeric suffix), so now we have different switch cases in emitIns_R_R_R to handle the three different formats. I think we'd still need to differentiate the vl specifier for IF_SVE_DY_3A by the insOpts passed to emitIns_R_R_R, but this will at least reduce the number of new insOpts needed.

Ideally I'd rather avoid this. But we might have to eventually.

Another option is to not split up the instructions, but to (mis-)use existing insOpts values to differentiate the formats, but this would probably be confusing to the reader. Maybe we could hide this usage behind helper methods (like getInsOptsScalableWithPredicatePair(insOpts opt), which takes in e.g. INS_OPTS_SCALABLE_B_WITH_SCALAR and returns INS_OPTS_SCALABLE_B_WITH_PREDICATE_MERGE) to improve readability?

Reducing insOpts seems the better way.

One way would be to turn it from an enum into flags:

    INS_OPTS_SCALABLE_B = 0x1
    INS_OPTS_SCALABLE_H = 0x2
    INS_OPTS_SCALABLE_S = 0x4
    INS_OPTS_SCALABLE_D = 0x8
    INS_OPTS_SCALABLE_WIDE = 0x10
    INS_OPTS_SCALABLE_WITH_SIMD_VECTOR =0x20
    INS_OPTS_SCALABLE_WITH_SIMD_SCALAR = 0x40
    INS_OPTS_SCALABLE_WITH_SCALAR = 0x80

But that's a lot of refactoring code and we might still run out. So, probably not.

There should be no overlap between the non-SVE and SVE values. So we should simply be able to restart the enum at 1:

INS_OPTS_SCALABLE_B = 0x1
    INS_OPTS_SCALABLE_H,
    INS_OPTS_SCALABLE_S,
    INS_OPTS_SCALABLE_D,
    INS_OPTS_SCALABLE_WIDE_B = 0x5
    INS_OPTS_SCALABLE_WIDE_H,
    INS_OPTS_SCALABLE_WIDE_S,
 etc

So, INS_OPTS_PRE_INDEX and INS_OPTS_SCALABLE_B are both 1. No other code then needs to change.

amanasifkhalid · 2023-12-20T20:07:48Z

There should be no overlap between the non-SVE and SVE values. So we should simply be able to restart the enum at 1:

Good point, thank you for the suggestion! I had to refactor emitDispArrangement to get this to build, but everything seems to be working now. Here's the (sanitized) JitDisasm:

whilege p0.b, w0, w1
whilege p1.b, x2, x3
whilegt p2.b, w4, w5
whilegt p3.b, x6, x7
whilehi p4.h, w8, w9
whilehi p5.h, x10, x11
whilehs p6.h, w12, w13
whilehs p7.h, x14, x15
whilele p8.s, w0, w1
whilele p9.s, x2, x3
whilelo p10.s, w4, w5
whilelo p11.s, x6, x7
whilels p12.d, w8, w9
whilels p13.d, x10, x11
whilelt p14.d, w12, w13
whilelt p15.d, x14, x15
whilerw p0.b, x0, x1
whilerw p1.h, x2, x3
whilerw p2.s, x4, x5
whilerw p3.d, x6, x7
whilewr p4.b, x8, x9
whilewr p5.h, x10, x11
whilewr p6.s, x12, x13
whilewr p7.d, x14, x15
whilege { p0.b, p1.b }, x0, x1
whilegt { p2.b, p3.b }, x2, x3
whilehi { p4.h, p5.h }, x4, x5
whilehs { p6.h, p7.h }, x6, x7
whilele { p8.s, p9.s }, x8, x9
whilelo { p10.s, p11.s }, x10, x11
whilels { p12.d, p13.d }, x12, x13
whilelt { p14.d, p15.d }, x14, x15
whilege p8.b, x0, x1, vlx2
whilege p9.b, x2, x3, vlx4
whilegt p10.h, x4, x5, vlx2
whilegt p11.h, x6, x7, vlx4
whilehi p12.s, x8, x9, vlx2
whilehi p13.s, x10, x11, vlx4
whilehs p14.d, x12, x13, vlx2
whilehs p15.d, x14, x15, vlx4
whilele p8.b, x0, x1, vlx2
whilele p9.b, x2, x3, vlx4
whilelo p10.h, x4, x5, vlx2
whilelo p11.h, x6, x7, vlx4
whilels p12.d, x8, x9, vlx2
whilels p13.d, x10, x11, vlx4
whilelt p14.b, x12, x13, vlx2
whilelt p15.b, x14, x15, vlx4

and the (sanitized) cstool output:

whilege p0.b, w0, w1
whilege p1.b, x2, x3
whilegt p2.b, w4, w5
whilegt p3.b, x6, x7
whilehi p4.h, w8, w9
whilehi p5.h, x10, x11
whilehs p6.h, w12, w13
whilehs p7.h, x14, x15
whilele p8.s, w0, w1
whilele p9.s, x2, x3
whilelo p10.s, w4, w5
whilelo p11.s, x6, x7
whilels p12.d, w8, w9
whilels p13.d, x10, x11
whilelt p14.d, w12, w13
whilelt p15.d, x14, x15
whilerw p0.b, x0, x1
whilerw p1.h, x2, x3
whilerw p2.s, x4, x5
whilerw p3.d, x6, x7
whilewr p4.b, x8, x9
whilewr p5.h, x10, x11
whilewr p6.s, x12, x13
whilewr p7.d, x14, x15
whilege { p0.b, p1.b }, x0, x1
whilegt { p2.b, p3.b }, x2, x3
whilehi { p4.h, p5.h }, x4, x5
whilehs { p6.h, p7.h }, x6, x7
whilele { p8.s, p9.s }, x8, x9
whilelo { p10.s, p11.s }, x10, x11
whilels { p12.d, p13.d }, x12, x13
whilelt { p14.d, p15.d }, x14, x15
whilege pn8.b, x0, x1, vlx2
whilege pn9.b, x2, x3, vlx4
whilegt pn0xA.h, x4, x5, vlx2
whilegt pn0xB.h, x6, x7, vlx4
whilehi pn0xC.s, x8, x9, vlx2
whilehi pn0xD.s, x10, x11, vlx4
whilehs pn0xE.d, x12, x13, vlx2
whilehs pn0xF.d, x14, x15, vlx4
whilele pn8.b, x0, x1, vlx2
whilele pn9.b, x2, x3, vlx4
whilelo pn0xA.h, x4, x5, vlx2
whilelo pn0xB.h, x6, x7, vlx4
whilels pn0xC.d, x8, x9, vlx2
whilels pn0xD.d, x10, x11, vlx4
whilelt pn0xE.b, x12, x13, vlx2
whilelt pn0xF.b, x14, x15, vlx4

The only diffs I see locally are the different predicate register formats at the end (e.g. p10.h vs pn0xA.h).

a74nh

LGTM now. Thanks!

(note: I'm away until new year now).

amanasifkhalid · 2023-12-21T12:22:03Z

Thank you for the review, and enjoy your holiday!

kunalspathak

Added a few questions and suggestions.

kunalspathak · 2023-12-21T14:49:30Z

src/coreclr/jit/instr.h

-    INS_OPTS_SCALABLE_B,
+    // There should be no overlap between non-SVE and SVE values,
+    // so reset value to 1 here
+    INS_OPTS_SCALABLE_B = 1,


We should move the INS_OPTS_SCALABLE* below INS_OPTS_ALIGN, so there is no gap between INS_OPTS_2D and INS_OPTS_MSL.

kunalspathak · 2023-12-21T14:51:27Z

src/coreclr/jit/codegenarm64test.cpp

@@ -5371,7 +5371,111 @@ void CodeGen::genArm64EmitterUnitTestsSve()
    theEmitter->emitIns_R_R_R(INS_sve_frecpx, EA_SCALABLE, REG_V5, REG_P5, REG_V5,
                              INS_OPTS_SCALABLE_H); // FRECPX  <Zd>.<T>, <Pg>/M, <Zn>.<T>
    theEmitter->emitIns_R_R_R(INS_sve_fsqrt, EA_SCALABLE, REG_V6, REG_P6, REG_V6,
-                              INS_OPTS_SCALABLE_S); // FSQRT   <Zd>.<T>, <Pg>/M, <Zn>.<T>
+                              INS_OPTS_SCALABLE_S); /* FSQRT   <Zd>.<T>, <Pg>/M, <Zn>.<T> */


Suggested change

INS_OPTS_SCALABLE_S); /* FSQRT <Zd>.<T>, <Pg>/M, <Zn>.<T> */

INS_OPTS_SCALABLE_S); // FSQRT <Zd>.<T>, <Pg>/M, <Zn>.<T>

kunalspathak · 2023-12-21T15:02:14Z

src/coreclr/jit/emitarm64.h

+
+inline static bool isHighPredicateRegister(regNumber reg)
+{
+    return (reg > REG_PREDICATE_LOW_LAST) && (reg <= REG_PREDICATE_LAST);


probably good to introduce REG_PREDICATE_HIGH_FIRST and REG_PREDICATE_HIGH_LAST and use it here. It is more explicit that way.

and a static_assert_no_msg(REG_PREDICATE_HIGH_LAST == REG_PREDICATE_LAST).

kunalspathak · 2023-12-21T15:21:15Z

src/coreclr/jit/emitarm64.cpp

@@ -15658,6 +15844,44 @@ void emitter::emitDispArrangement(insOpts opt)
            str = "8b";
            break;
        case INS_OPTS_16B:
+            str = "16b";


any reason why these were added in emitDispArrangement?

Those cases were already in emitDispArrangement, I think the diff just makes it look like they were added in. I moved all the SVE-specific cases to emitDispSveArrangement, and left the remaining cases in emitDispArrangement as-is.

kunalspathak · 2023-12-21T15:21:49Z

src/coreclr/jit/emitarm64.cpp

@@ -15668,10 +15892,6 @@ void emitter::emitDispArrangement(insOpts opt)
        case INS_OPTS_SCALABLE_B_WITH_PREDICATE_MERGE:
            str = "b";
            break;
-        case INS_OPTS_4H:


does removing these have any implications on existing instructions that we added so far? Can you try to compare the output of all instructions jitdisasm vs. cstool with this change?

Can you try to compare the output of all instructions jitdisasm vs. cstool with this change?

Sure thing. I'll post a comment with diffable outputs from both.

amanasifkhalid · 2023-12-21T16:26:22Z

I created ugly but diffable JitDisasm/cstool outputs for the SVE instructions we have so far. I don't see any diffs locally, except for the different predicate register encodings we noted above.

Gist

kunalspathak · 2023-12-22T16:21:34Z

I created ugly but diffable JitDisasm/cstool outputs for the SVE instructions we have so far. I don't see any diffs locally, except for the different predicate register encodings we noted above.

Gist

Thanks for doing this.

kunalspathak · 2023-12-22T16:24:59Z

Looks like there are several test failures because of moving around the enum ordering.

Assert failure(PID 7640 [0x00001dd8], Thread: 4468 [0x1174]): Assertion failed '!insOptsScalable(opt)' in 'Program:AdvSimd_CompareEqual_Vector64_Byte_Zero(System.Runtime.Intrinsics.Vector64`1[ubyte]):System.Runtime.Intrinsics.Vector64`1[ubyte]' during 'Emit code' (IL size 12; hash 0xb87bf868; FullOpts)

    File: D:\a\_work\1\s\src\coreclr\jit\emitarm64.cpp Line: 15838
    Image: C:\h\w\AC960933\p\corerun.exe

amanasifkhalid · 2023-12-22T17:01:18Z

Looks like there are several test failures because of moving around the enum ordering.

Thank you for pointing that out. Those asserts I added -- e.g. assert(!insOptsScalable(opt)) -- are naive to the fact that enum values for SVE and non-SVE insOpts now overlap, so it's possible for a valid insOpts to be confused with an invalid one, thus triggering the assert. I've removed those asserts, though I'm not sure what other state we can assert instead to ensure emitDispSveArrangement is only being used for SVE instructions; the default case for the switch statement will catch these erroneous uses except for when a non-SVE insOpt value overlaps with an SVE one, in which case we'll print the wrong suffix (which might be enough of an indicator in asmdiffs that something is wrong?).

kunalspathak · 2023-12-22T17:45:17Z

I must say that although the insOpts are non-overlapping between SVE and non-SVE, I am a little uncomfortable with resetting the enum values, because of bugs like this one and because a value could mistakenly be passed around and end up with the wrong meaning. Just a thought and not for this PR, but I am wondering if we should re-evaluate if we need so many different entries. They are only used while deciding the instruction format in emitIns_R* methods (in addition to the asserts), and I am wondering if we should have a separate version of emitIns_R* specific to SVE which propagates the information of SIMD_SCALAR vs. SIMD_VECTOR, etc. differently to decide the format to pick?

        case INS_OPTS_SCALABLE_B:
        case INS_OPTS_SCALABLE_WIDE_B:
        case INS_OPTS_SCALABLE_B_WITH_SIMD_SCALAR:
        case INS_OPTS_SCALABLE_B_WITH_SIMD_VECTOR:
        case INS_OPTS_SCALABLE_B_WITH_SCALAR:
        case INS_OPTS_SCALABLE_B_WITH_PREDICATE_MERGE:
            return EA_1BYTE;

Of course, the alternative to increase the size of _idInsOpt from 6 to 7 might be expensive as that will further reduce our ability for representing small consts in smallest possible instrDesc. Can you check the TP cost of doing that in a separate PR?

amanasifkhalid · 2023-12-22T17:52:04Z

wondering if we should have a separate version of emitIns_R* specific to SVE which propagates the information of SIMD_SCALAR vs. SIMD_VECTOR, etc. differently to decide the format to pick?

I like this idea of separating SVE-specific logic out into separate functions. All the new SVE-specific insOpt values are a bit confusing, and I imagine we'll end up adding more if we continue our current strategy, so it would be nice to clean these up. Also while not as important, having SVE-specific paths might be slightly faster in terms of TP (smaller switch statements, etc).

Of course, the alternative to increase the size of _idInsOpt from 6 to 7 might be expensive as that will further reduce our ability for representing small consts in smallest possible instrDesc. Can you check the TP cost of doing that in a separate PR?

Sure thing. I initially tried this strategy, though I ran into some weird build issues from the static instrDesc size/offset checks...

a74nh · 2024-01-09T12:24:05Z

Just a thought and not for this PR, but I am wondering if we should re-evaluate if we need so many different entries. They are only used while deciding the instruction format in emitIns_R* methods (in addition to the asserts)

This is the crucial part - in some cases, once the group is decided, this information in the insOpt is no longer needed. In other cases, the insOpt information is still required later (eg - most of the neon instructions).

I'll write a quick PR that introduces a insGroupOpts which is an additional optional argument to emitIns_R_R_R_R(). This should remove most of the new insOpt entries, and means nothing needs to increase in size.

a74nh · 2024-01-09T16:34:01Z

I'll write a quick PR that introduces a insGroupOpts which is an additional optional argument to emitIns_R_R_R_R(). This should remove most of the new insOpt entries, and means nothing needs to increase in size.

See #96692

kunalspathak · 2024-01-09T19:40:05Z

See #96692

I will wait for this to merge before reviewing this PR.

a74nh · 2024-01-12T10:49:01Z

See #96692

This is now merged.

All your InsOpt entries should be movable into InsScalableOpts. Note, this new enum is not stored in instr. So, you can't assert it in the SanityCheck function. It's only used to determine which encoding group to use in EmitIns_R_R_R_()

amanasifkhalid · 2024-01-12T23:54:57Z

@a74nh thank you for cleaning things up! I've merged your changes in, and updated this gist with the JitDisasm and cstool outputs. They look the same, except for the cstool issue with printing predicate register numbers as hex.

cc @kunalspathak

kunalspathak · 2024-01-15T07:13:34Z

src/coreclr/jit/instr.h

+
+    // There should be no overlap between non-SVE and SVE values,
+    // so reset value to 1 here
+    INS_OPTS_SCALABLE_B = 1,


we don't need resetting anymore. also, you can pretty much revert changes done in enum insOpts

kunalspathak · 2024-01-15T07:20:15Z

src/coreclr/jit/emitarm64.h

@@ -1012,6 +1022,17 @@ inline static bool insScalableOptsNone(insScalableOpts sopt)
    return sopt == INS_SCALABLE_OPTS_NONE;
 }

+inline static bool insScalableOptsWithPredicatePair(insScalableOpts sopt)


looks like we missed the summary header for newer SVE methods we added recently. do you mind adding those?

kunalspathak · 2024-01-15T07:26:42Z

src/coreclr/jit/emitarm64.cpp

@@ -16362,7 +16499,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
            {
                code = insEncodeSveElemsize_dtype(ins, optGetSveElemsize(id->idInsOpt()), code);
            }
-


extra line deletion

a74nh

LGTM (once other review comments are fixed up)

amanasifkhalid · 2024-01-15T17:05:31Z

Thank you for the reviews! @kunalspathak I've addressed your feedback.

kunalspathak

LGTM

…dotnet#96201)

amanasifkhalid added 3 commits December 19, 2023 15:23

Add IF_SVE_DT_3A/IF_SVE_DU_3A/IF_SVE_DX_3A/IF_SVE_DY_3A

d8af5cd

Merge from main

067612f

Print vl

acb2677

amanasifkhalid added the arm-sve Work related to arm64 SVE/SVE2 support label Dec 19, 2023

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Dec 19, 2023

ghost assigned amanasifkhalid Dec 19, 2023

amanasifkhalid mentioned this pull request Dec 19, 2023

Arm64: Implement SVE encodings #94549

Closed

amanasifkhalid added 2 commits December 19, 2023 16:34

Add insOptsScalableWithVectorLength to insOptsScalable

2c00381

Style

a8bcb13

Fix insEncodeReg_P_2_to_0

d809682

Reset insOpts enum value for SVE formats

dc66c6e

build-analysis bot mentioned this pull request Dec 21, 2023

Tracking issue for CI build timeouts #76454

Closed

a74nh approved these changes Dec 21, 2023

View reviewed changes

kunalspathak requested changes Dec 21, 2023

View reviewed changes

ghost added needs-author-action An issue or pull request that requires more info or actions from the author. and removed needs-author-action An issue or pull request that requires more info or actions from the author. labels Dec 21, 2023

Feedback

d259e77

build-analysis bot mentioned this pull request Dec 21, 2023

MSBuild crashing in the build #92290

Open

Remove asserts

0202549

build-analysis bot mentioned this pull request Dec 22, 2023

System.Tests.StringTests failing on windows Release NativeAOT_Libraries leg #96279

Closed

Merge insScalableOpts in

079779b

Fix typo

f5d0fa0

kunalspathak reviewed Jan 15, 2024

View reviewed changes

a74nh approved these changes Jan 15, 2024

View reviewed changes

Feedback

3771530

build-analysis bot mentioned this pull request Jan 15, 2024

Test failure - System.NullReferenceException in System.Threading.Lock.TryInitializeStatics #94728

Closed

kunalspathak approved these changes Jan 16, 2024

View reviewed changes

amanasifkhalid merged commit 0339a41 into dotnet:main Jan 16, 2024
127 of 129 checks passed

tmds pushed a commit to tmds/runtime that referenced this pull request Jan 23, 2024

JIT ARM64: Add IF_SVE_DT_3A, IF_SVE_DU_3A, IF_SVE_DX_3A, IF_SVE_DY_3A (…

9c6afbe

…dotnet#96201)

github-actions bot locked and limited conversation to collaborators Feb 15, 2024

	INS_OPTS_SCALABLE_S); /* FSQRT <Zd>.<T>, <Pg>/M, <Zn>.<T> */
	INS_OPTS_SCALABLE_S); // FSQRT <Zd>.<T>, <Pg>/M, <Zn>.<T>

JIT ARM64: Add IF_SVE_DT_3A, IF_SVE_DU_3A, IF_SVE_DX_3A, IF_SVE_DY_3A #96201

JIT ARM64: Add IF_SVE_DT_3A, IF_SVE_DU_3A, IF_SVE_DX_3A, IF_SVE_DY_3A #96201

Conversation

amanasifkhalid commented Dec 19, 2023

ghost commented Dec 19, 2023

kunalspathak commented Dec 19, 2023

kunalspathak commented Dec 19, 2023

amanasifkhalid commented Dec 20, 2023

a74nh commented Dec 20, 2023

a74nh commented Dec 20, 2023

amanasifkhalid commented Dec 20, 2023

a74nh commented Dec 20, 2023

amanasifkhalid commented Dec 20, 2023

a74nh left a comment

Choose a reason for hiding this comment

amanasifkhalid commented Dec 21, 2023

kunalspathak left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

amanasifkhalid commented Dec 21, 2023 • edited Loading

kunalspathak commented Dec 22, 2023

kunalspathak commented Dec 22, 2023

amanasifkhalid commented Dec 22, 2023

kunalspathak commented Dec 22, 2023

amanasifkhalid commented Dec 22, 2023

a74nh commented Jan 9, 2024

a74nh commented Jan 9, 2024

kunalspathak commented Jan 9, 2024

a74nh commented Jan 12, 2024

amanasifkhalid commented Jan 12, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

a74nh left a comment

Choose a reason for hiding this comment

amanasifkhalid commented Jan 15, 2024

kunalspathak left a comment

Choose a reason for hiding this comment

amanasifkhalid commented Dec 21, 2023 •

edited

Loading