dotnet · kunalspathak · Jan 23, 2024 · Jan 19, 2024 · kunalspathak · Jan 22, 2024
diff --git a/src/coreclr/jit/codegenarm64test.cpp b/src/coreclr/jit/codegenarm64test.cpp
@@ -5138,6 +5138,22 @@ void CodeGen::genArm64EmitterUnitTestsSve()
     theEmitter->emitIns_R_R_R(INS_sve_ftssel, EA_SCALABLE, REG_V17, REG_V16, REG_V15,
                               INS_OPTS_SCALABLE_D); // FTSSEL  <Zd>.<T>, <Zn>.<T>, <Zm>.<T>
 
+    // IF_SVE_BL_1A
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cntb, EA_8BYTE, REG_R0, SVE_PATTERN_POW2,
+                                    1); // CNTB    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cntd, EA_8BYTE, REG_R30, SVE_PATTERN_VL1,
+                                    16); // CNTD    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cnth, EA_8BYTE, REG_R12, SVE_PATTERN_VL7,
+                                    5); // CNTH    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cntw, EA_8BYTE, REG_R23, SVE_PATTERN_VL256,
+                                    7); // CNTW    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cntb, EA_8BYTE, REG_R21, SVE_PATTERN_MUL4,
+                                    8); // CNTB    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cntd, EA_8BYTE, REG_R15, SVE_PATTERN_MUL3,
+                                    10); // CNTD    <Xd>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_cnth, EA_8BYTE, REG_R5, SVE_PATTERN_ALL,
+                                    13); // CNTH    <Xd>{, <pattern>{, MUL #<imm>}}
+
     // IF_SVE_CL_3A
     theEmitter->emitIns_R_R_R(INS_sve_compact, EA_SCALABLE, REG_V16, REG_P7, REG_V13,
                               INS_OPTS_SCALABLE_S); // COMPACT <Zd>.<T>, <Pg>, <Zn>.<T>

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
@@ -971,19 +971,25 @@ class emitter
                 iiaEncodedInstrCount = (count << iaut_SHIFT) | iaut_INST_COUNT;
             }
 
-#ifdef TARGET_ARMARCH
-
+#ifdef TARGET_ARM
             struct
             {
-#ifdef TARGET_ARM64
-                // For 64-bit architecture this 32-bit structure can pack with these unsigned bit fields
+                regNumber _idReg3 : REGNUM_BITS;
+                regNumber _idReg4 : REGNUM_BITS;
+            };
+#elif defined(TARGET_ARM64)
+            struct
+            {
+                // This 32-bit structure can pack with these unsigned bit fields
                 emitLclVarAddr iiaLclVar;
                 unsigned       _idRegBit : 1; // Reg3 is scaled by idOpSize bits
                 GCtype         _idGCref2 : 2;
-#endif
-                regNumber _idReg3 : REGNUM_BITS;
-                regNumber _idReg4 : REGNUM_BITS;
+                regNumber      _idReg3 : REGNUM_BITS;
+                regNumber      _idReg4 : REGNUM_BITS;
             };
+
+            insSvePattern _idSvePattern;
+
 #elif defined(TARGET_XARCH)
             struct
             {
@@ -1155,7 +1161,7 @@ class emitter
             _idCodeSize = sz;
         }
 #elif defined(TARGET_RISCV64)
-        unsigned  idCodeSize() const
+        unsigned idCodeSize() const
         {
             return _idCodeSize;
         }
@@ -1433,6 +1439,16 @@ class emitter
             assert(!idIsSmallDsc());
             idAddr()->_idRegBit = val ? 1 : 0;
         }
+        insSvePattern idSvePattern() const // Getter: returns the SVE pattern operand kept in idAddr().
+        {
+            assert(!idIsSmallDsc()); // must not be used on a small descriptor
+            return (idAddr()->_idSvePattern);
+        }
+        void idSvePattern(insSvePattern idSvePattern) // Setter: records the SVE pattern operand in idAddr().
+        {
+            assert(!idIsSmallDsc()); // must not be used on a small descriptor
+            idAddr()->_idSvePattern = idSvePattern;
+        }
 #endif // TARGET_ARM64
 
 #endif // TARGET_ARMARCH

diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
@@ -1081,6 +1081,14 @@ void emitter::emitInsSanityCheck(instrDesc* id)
             assert(isValidScalarDatasize(elemsize));
             break;
 
+        case IF_SVE_BL_1A: // ............iiii ......pppppddddd -- SVE element count
+            elemsize = id->idOpSize();
+            assert(id->idInsOpt() == INS_OPTS_NONE);
+            assert(isGeneralRegister(id->idReg1()));
+            assert(elemsize == EA_8BYTE);
+            assert(isValidUimm4From1(emitGetInsSC(id)));
+            break;
+
         // Scalable, 4 regs, to predicate register.
         case IF_SVE_CX_4A: // ........xx.mmmmm ...gggnnnnn.DDDD -- SVE integer compare vectors
             elemsize = id->idOpSize();
@@ -1938,6 +1946,19 @@ static const char * const  pnRegNames[] =
     "pn10", "pn11", "pn12", "pn13", "pn14",
     "pn15"
 };
+
+static const char * const  svePatternNames[] = // display names, indexed by insSvePattern value (0..31)
+{
+    "pow2", "vl1", "vl2", "vl3",
+    "vl4", "vl5", "vl6", "vl7",
+    "vl8", "vl16", "vl32", "vl64",
+    "vl128", "vl256", "invalid", "invalid", // values 14..28 have no SVE_PATTERN_* enumerator
+    "invalid", "invalid", "invalid", "invalid",
+    "invalid", "invalid", "invalid", "invalid",
+    "invalid", "invalid", "invalid", "invalid",
+    "invalid", "mul4", "mul3", "all"
+};
+
 // clang-format on
 
 //------------------------------------------------------------------------
@@ -11360,6 +11381,49 @@ void emitter::emitIns_R_I_FLAGS_COND(
     appendToCurIG(id);
 }
 
+/*****************************************************************************
+ *
+ *  Add an instruction referencing a register, an SVE pattern and an immediate (CNTB/CNTD/CNTH/CNTW).
+ */
+
+void emitter::emitIns_R_PATTERN_I(instruction ins, emitAttr attr, regNumber reg1, insSvePattern pattern, int imm)
+{
+    emitAttr  size     = EA_SIZE(attr);
+    emitAttr  elemsize = EA_UNKNOWN; // NOTE(review): never read in this function
+    insFormat fmt      = IF_NONE;
+
+    /* Figure out the encoding format of the instruction */
+    switch (ins)
+    {
+        case INS_sve_cntb:
+        case INS_sve_cntd:
+        case INS_sve_cnth:
+        case INS_sve_cntw:
+            assert(isGeneralRegister(reg1));
+            assert(size == EA_8BYTE);
+            assert(isValidUimm4From1(imm)); // the immediate must be in [1, 16]
+            fmt = IF_SVE_BL_1A;
+            break;
+
+        default:
+            unreached();
+            break;
+
+    } // end switch (ins)
+    assert(fmt != IF_NONE);
+
+    instrDesc* id = emitNewInstrCns(attr, imm); // imm is read back later through emitGetInsSC(id)
+
+    id->idIns(ins);
+    id->idInsFmt(fmt);
+
+    id->idReg1(reg1);
+    id->idSvePattern(pattern);
+
+    dispIns(id);
+    appendToCurIG(id);
+}
+
 /*****************************************************************************
  *
  *  Add a memory barrier instruction with a 'barrier' immediate
@@ -14538,6 +14602,17 @@ void emitter::emitIns_Call(EmitCallType          callType,
     return (code_t)imm << 14;
 }
 
+/*****************************************************************************
+ *
+ * Returns the encoding for an immediate in [1, 16], stored as (imm - 1) in the 4 bits at locations '19-16'.
+ */
+
+/*static*/ emitter::code_t emitter::insEncodeUimm4From1_19_to_16(ssize_t imm)
+{
+    assert(isValidUimm4From1(imm));
+    return (code_t)(imm - 1) << 16; // bias by one so that 1..16 maps onto the field values 0..15
+}
+
 /*****************************************************************************
  *
  *  Returns the encoding to select the <R> 4/8-byte width specifier <R>
@@ -14554,6 +14629,15 @@ void emitter::emitIns_Call(EmitCallType          callType,
     return 0;
 }
 
+/*****************************************************************************
+ *
+ * Returns the encoding to select an insSvePattern, at bit locations '9-5'.
+ */
+/*static*/ emitter::code_t emitter::insEncodeSvePattern(insSvePattern pattern)
+{
+    return (code_t)((unsigned)pattern << 5); // no range check here; the declared enumerators all fit in 5 bits
+}
+
 BYTE* emitter::emitOutputLoadLabel(BYTE* dst, BYTE* srcAddr, BYTE* dstAddr, instrDescJmp* id)
 {
     instruction ins    = id->idIns();
@@ -16621,6 +16705,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             dst += emitOutput_Instr(dst, code);
             break;
 
+        // Immediate and pattern to general purpose.
+        case IF_SVE_BL_1A: // ............iiii ......pppppddddd -- SVE element count
+            imm  = emitGetInsSC(id);
+            code = emitInsCodeSve(ins, fmt);
+            code |= insEncodeReg_Rd(id->idReg1());           // ddddd
+            code |= insEncodeSvePattern(id->idSvePattern()); // ppppp
+            code |= insEncodeUimm4From1_19_to_16(imm);       // iiii
+            dst += emitOutput_Instr(dst, code);
+            break;
+
         // Scalable to general register.
         case IF_SVE_CO_3A: // ........xx...... ...gggmmmmmddddd -- SVE conditionally extract element to general register
         case IF_SVE_CS_3A: // ........xx...... ...gggnnnnnddddd -- SVE extract element to general register
@@ -17848,6 +17942,20 @@ void emitter::emitDispAddrRRExt(regNumber reg1, regNumber reg2, insOpts opt, boo
     printf("]");
 }
 
+/*****************************************************************************
+ *
+ *  Display an insSvePattern by name, followed by a comma when 'addComma' is true
+ */
+void emitter::emitDispSvePattern(insSvePattern pattern, bool addComma)
+{
+    printf("%s", svePatternNames[pattern]); // NOTE(review): the table index is not bounds-checked
+
+    if (addComma)
+    {
+        emitDispComma();
+    }
+}
+
 /*****************************************************************************
  *
  *  Display (optionally) the instruction encoding in hex
@@ -19270,6 +19378,17 @@ void emitter::emitDispInsHelp(
             emitDispReg(id->idReg3(), size, false);             // mmmmm
             break;
 
+        case IF_SVE_BL_1A: // ............iiii ......pppppddddd -- SVE element count
+            imm = emitGetInsSC(id);
+            emitDispReg(id->idReg1(), size, true);             // ddddd
+            emitDispSvePattern(id->idSvePattern(), (imm > 1)); // ppppp
+            if (imm > 1)
+            {
+                printf("mul ");
+                emitDispImm(emitGetInsSC(id), false, false); // iiii
+            }
+            break;
+
         // <Zd>.<T>, <Zn>.<T>, <Zm>.D
         case IF_SVE_BG_3A: // ........xx.mmmmm ......nnnnnddddd -- SVE bitwise shift by wide elements (unpredicated)
             emitDispSveReg(id->idReg1(), id->idInsOpt(), true);       // ddddd
@@ -21946,6 +22065,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             result.insLatency    = PERFSCORE_LATENCY_8C;
             break;
 
+        case IF_SVE_BL_1A: // ............iiii ......pppppddddd -- SVE element count
+            result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
         case IF_SVE_BK_3A: // ........xx.mmmmm ......nnnnnddddd -- SVE floating-point trig select coefficient
             result.insThroughput = PERFSCORE_THROUGHPUT_2C;
             result.insLatency    = PERFSCORE_LATENCY_3C;

diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h
@@ -65,6 +65,7 @@ void emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr);
 void emitDispExtendReg(regNumber reg, insOpts opt, ssize_t imm);
 void emitDispAddrRI(regNumber reg, insOpts opt, ssize_t imm);
 void emitDispAddrRRExt(regNumber reg1, regNumber reg2, insOpts opt, bool isScaled, emitAttr size);
+void emitDispSvePattern(insSvePattern pattern, bool addComma);
 
 /************************************************************************/
 /*  Private members that deal with target-dependent instr. descriptors  */
@@ -526,6 +527,9 @@ static code_t insEncodeSimm5_20_to_16(ssize_t imm);
 // Returns the encoding for the immediate value as 7-bits at bit locations '20-14'.
 static code_t insEncodeUimm7_20_to_14(ssize_t imm);
 
+// Returns the encoding for the immediate value as 4-bits starting from 1, at bit locations '19-16'.
+static code_t insEncodeUimm4From1_19_to_16(ssize_t imm);
+
 // Returns the encoding to select the elemsize for an Arm64 SVE vector instruction plus an immediate.
 // This specifically encodes the field 'tszh:tszl' at bit locations '23-22:9-8'.
 static code_t insEncodeSveShift_23_to_22_9_to_0(emitAttr size, bool isRightShift, size_t imm);
@@ -534,6 +538,9 @@ static code_t insEncodeSveShift_23_to_22_9_to_0(emitAttr size, bool isRightShift
 // for an Arm64 Sve instruction.
 static code_t insEncodeSveElemsize_R_22(emitAttr size);
 
+// Returns the encoding to select an insSvePattern
+static code_t insEncodeSvePattern(insSvePattern pattern);
+
 // Returns true if 'reg' represents an integer register.
 static bool isIntegerRegister(regNumber reg)
 {
@@ -582,6 +589,12 @@ static bool isValidSimm4_MultipleOf32(ssize_t value)
     return (-256 <= value) && (value <= 224) && (value % 32 == 0);
 };
 
+// Returns true if 'value' is in [1, 16]: a 4-bit unsigned immediate encoded as value - 1 (such as for CNTB).
+static bool isValidUimm4From1(ssize_t value)
+{
+    return (1 <= value) && (value <= 16);
+};
+
 // Returns true if 'value' is a legal unsigned immediate 5 bit encoding (such as for CCMP).
 static bool isValidUimm5(ssize_t value)
 {
@@ -1174,6 +1187,8 @@ void emitIns_R_R_FLAGS_COND(
 
 void emitIns_R_I_FLAGS_COND(instruction ins, emitAttr attr, regNumber reg1, int imm, insCflags flags, insCond cond);
 
+void emitIns_R_PATTERN_I(instruction ins, emitAttr attr, regNumber reg1, insSvePattern pattern, int imm);
+
 void emitIns_BARR(instruction ins, insBarrier barrier);
 
 void emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fdlHnd, int offs);

diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h
@@ -322,6 +322,28 @@ enum insScalableOpts : unsigned
     INS_SCALABLE_OPTS_UNPREDICATED_WIDE, // Variants without a predicate and wide elements (eg asr)
 };
 
+// Maps directly to the pattern used in SVE instructions such as cntb.
+enum insSvePattern : unsigned
+{
+    SVE_PATTERN_POW2 = 0,   // The largest power of 2.
+    SVE_PATTERN_VL1 = 1,    // 1 element.
+    SVE_PATTERN_VL2 = 2,    // 2 elements.
+    SVE_PATTERN_VL3 = 3,    // 3 elements.
+    SVE_PATTERN_VL4 = 4,    // 4 elements.
+    SVE_PATTERN_VL5 = 5,    // 5 elements.
+    SVE_PATTERN_VL6 = 6,    // 6 elements.
+    SVE_PATTERN_VL7 = 7,    // 7 elements.
+    SVE_PATTERN_VL8 = 8,    // 8 elements.
+    SVE_PATTERN_VL16 = 9,   // 16 elements.
+    SVE_PATTERN_VL32 = 10,  // 32 elements.
+    SVE_PATTERN_VL64 = 11,  // 64 elements.
+    SVE_PATTERN_VL128 = 12, // 128 elements.
+    SVE_PATTERN_VL256 = 13, // 256 elements.
+    SVE_PATTERN_MUL4 = 29,  // The largest multiple of 4.
+    SVE_PATTERN_MUL3 = 30,  // The largest multiple of 3.
+    SVE_PATTERN_ALL = 31    // All available (implicitly a multiple of two).
+};
+
 enum insCond : unsigned
 {
     INS_COND_EQ,