From 0f88d8ec0bc2e7f0fabfe565f67fe3e622ce58b0 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Fri, 19 Jul 2024 07:18:14 -0700 Subject: [PATCH 01/14] Add tracking of FFR register somewhat workable code cleanup Remove FFR Add all the GetFfr* wip Work with MskCns() model Use physReg approach Remove commented prototypes working Remove bunch of unnecessary code Remove SpecialImport from GetFFR/SetFFR/LoadFirstFaulting some more code cleanup some fixup --- src/coreclr/jit/codegenarm64.cpp | 14 +++++ src/coreclr/jit/compiler.h | 4 ++ src/coreclr/jit/gentree.cpp | 10 +++ src/coreclr/jit/hwintrinsic.cpp | 29 +++------ src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 55 +++++++++++++++++ src/coreclr/jit/hwintrinsiclistarm64sve.h | 10 +++ src/coreclr/jit/lclvars.cpp | 3 + src/coreclr/jit/lowerarmarch.cpp | 67 +++++++++++++++++++++ src/coreclr/jit/lsraarm64.cpp | 9 ++- src/coreclr/jit/registerarm64.h | 2 +- src/coreclr/jit/simd.cpp | 23 +++++++ 11 files changed, 205 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 2f6ebaf56fc84..bb161856a3ca1 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2954,6 +2954,20 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode) } #endif // FEATURE_SIMD +#ifdef FEATURE_MASKED_HW_INTRINSICS + if ((compiler->lvaFfrRegister == lclNode->GetLclNum()) && (targetReg == REG_NA)) + { + assert(targetType == TYP_MASK); + // We are about to store the FFR on stack. So first read the FFR register and + // store it in op1Reg. After that, op1Reg will be stored on the stack. + assert(varDsc->lvImplicitlyReferenced == 1); + + regNumber op1Reg = data->GetRegNum(); + assert(op1Reg != REG_NA); + GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, op1Reg); + } +#endif + genConsumeRegs(data); regNumber dataReg = REG_NA; diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 7013184028a87..60a478e16d5fa 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -4344,6 +4344,10 @@ class Compiler #endif // defined(FEATURE_SIMD) unsigned lvaGSSecurityCookie; // LclVar number +#ifdef TARGET_ARM64 + unsigned lvaFfrRegister; // LclVar number + unsigned getFFRegisterVarNum(); +#endif bool lvaTempsHaveLargerOffsetThanVars(); // Returns "true" iff local variable "lclNum" is in SSA form. diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 759dde9b584fb..c7c1cd549d9f8 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -7448,7 +7448,11 @@ GenTreeIntCon* Compiler::gtNewFalse() // return a new node representing the value in a physical register GenTree* Compiler::gtNewPhysRegNode(regNumber reg, var_types type) { +#ifdef TARGET_ARM64 + assert(genIsValidIntReg(reg) || (reg == REG_SPBASE) || (reg == REG_FFR)); +#else assert(genIsValidIntReg(reg) || (reg == REG_SPBASE)); +#endif GenTree* result = new (this, GT_PHYSREG) GenTreePhysReg(reg, type); return result; } @@ -11554,6 +11558,12 @@ void Compiler::gtGetLclVarNameInfo(unsigned lclNum, const char** ilKindOut, cons { ilName = "GsCookie"; } +#ifdef TARGET_ARM64 + else if (lclNum == lvaFfrRegister) + { + ilName = "FFReg"; + } +#endif else if (lclNum == lvaRetAddrVar) { ilName = "ReturnAddress"; diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 63455226ed2fb..e977b649dc5a2 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -2297,10 +2297,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, switch (intrinsic) { - case NI_Sve_CreateBreakAfterMask: case NI_Sve_CreateBreakAfterPropagateMask: - case NI_Sve_CreateBreakBeforeMask: case NI_Sve_CreateBreakBeforePropagateMask: + { + // HWInstrinsic requires a mask for op3 + convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(3)); + FALLTHROUGH; + } + case NI_Sve_CreateBreakAfterMask: + case NI_Sve_CreateBreakBeforeMask: case NI_Sve_CreateMaskForFirstActiveElement: case NI_Sve_CreateMaskForNextActiveElement: case NI_Sve_GetActiveElementCount: @@ -2310,30 +2315,16 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, { // HWInstrinsic requires a mask for op2 convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(2)); - break; + FALLTHROUGH; } - default: - break; - } - - switch (intrinsic) - { - case NI_Sve_CreateBreakAfterPropagateMask: - case NI_Sve_CreateBreakBeforePropagateMask: { - // HWInstrinsic requires a mask for op3 - convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(3)); + // HWInstrinsic requires a mask for op1 + convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(1)); break; } - - default: - break; } - // HWInstrinsic requires a mask for op1 - convertToMaskIfNeeded(retNode->AsHWIntrinsic()->Op(1)); - if (HWIntrinsicInfo::IsMultiReg(intrinsic)) { assert(HWIntrinsicInfo::IsExplicitMaskedOperation(retNode->AsHWIntrinsic()->GetHWIntrinsicId())); diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index a93fa678156ca..4141839958c80 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -303,6 +303,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emitAttr emitSize; insOpts opt; + bool unspilledFfr = false; if (HWIntrinsicInfo::SIMDScalar(intrin.id)) { @@ -318,6 +319,29 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { emitSize = EA_SCALABLE; opt = emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType)); + + bool readsFfr = false; + switch (intrin.id) + { + case NI_Sve_GetFfrByte: + case NI_Sve_GetFfrInt16: + case NI_Sve_GetFfrInt32: + case NI_Sve_GetFfrInt64: + case NI_Sve_GetFfrSByte: + case NI_Sve_GetFfrUInt16: + case NI_Sve_GetFfrUInt32: + case NI_Sve_GetFfrUInt64: + case NI_Sve_LoadVectorFirstFaulting: + readsFfr = true; + break; + default: + break; + } + + if (readsFfr && (intrin.op1 != nullptr) && ((intrin.op1->gtFlags & GTF_SPILLED) != 0)) + { + unspilledFfr = true; + } } else if (intrin.category == HW_Category_Special) { @@ -2366,6 +2390,37 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve_LoadVectorFirstFaulting: + { + insScalableOpts sopt = (opt == INS_OPTS_SCALABLE_B) ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N; + GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, REG_ZR, opt, sopt); + break; + } + + case NI_Sve_GetFfrByte: + case NI_Sve_GetFfrInt16: + case NI_Sve_GetFfrInt32: + case NI_Sve_GetFfrInt64: + case NI_Sve_GetFfrSByte: + case NI_Sve_GetFfrUInt16: + case NI_Sve_GetFfrUInt32: + case NI_Sve_GetFfrUInt64: + { + if (unspilledFfr) + { + // We have unspilled the FFR in op1Reg. Restore it back in FFR register. + GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt); + } + + GetEmitter()->emitIns_R(ins, emitSize, targetReg, INS_OPTS_SCALABLE_B); + break; + } + case NI_Sve_SetFfr: + { + assert(targetReg == REG_NA); + GetEmitter()->emitIns_R(ins, emitSize, op1Reg, opt); + break; + } case NI_Sve_ConditionalExtractAfterLastActiveElementScalar: case NI_Sve_ConditionalExtractLastActiveElementScalar: { diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index 46dd188785d19..9d9d8521da688 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -122,6 +122,14 @@ HARDWARE_INTRINSIC(Sve, GatherVectorUInt32WithByteOffsetsZeroExtend, HARDWARE_INTRINSIC(Sve, GatherVectorUInt32ZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, GatherVectorWithByteOffsets, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, GetActiveElementCount, -1, 2, true, {INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_ExplicitMaskedOperation) +HARDWARE_INTRINSIC(Sve, GetFfrByte, -1, -1, false, {INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrSByte, -1, -1, false, {INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, InsertIntoShiftedVector, -1, 2, true, {INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, LeadingSignCount, -1, -1, false, {INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LeadingZeroCount, -1, -1, false, {INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -142,6 +150,7 @@ HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToInt64, HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToUInt16, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToUInt32, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, LoadVectorFirstFaulting, -1, -1, false, {INS_sve_ldff1b, INS_sve_ldff1b, INS_sve_ldff1h, INS_sve_ldff1h, INS_sve_ldff1w, INS_sve_ldff1w, INS_sve_ldff1d, INS_sve_ldff1d, INS_sve_ldff1w, INS_sve_ldff1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorInt16NonFaultingSignExtendToInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1sh, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorInt16NonFaultingSignExtendToInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1sh, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorInt16NonFaultingSignExtendToUInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1sh, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -237,6 +246,7 @@ HARDWARE_INTRINSIC(Sve, Scatter32BitNarrowing, HARDWARE_INTRINSIC(Sve, Scatter32BitWithByteOffsetsNarrowing, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_st1w, INS_sve_st1w, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, Scatter8BitNarrowing, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, Scatter8BitWithByteOffsetsNarrowing, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, SetFfr, -1, 1, true, {INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialSideEffectMask|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ShiftLeftLogical, -1, -1, false, {INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, ShiftRightArithmetic, -1, -1, false, {INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, ShiftRightArithmeticForDivide, -1, -1, false, {INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_ShiftRightByImmediate, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 62d3769879b20..bbef4c47bbbd5 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -63,6 +63,9 @@ void Compiler::lvaInit() #endif // JIT32_GCENCODER lvaNewObjArrayArgs = BAD_VAR_NUM; lvaGSSecurityCookie = BAD_VAR_NUM; +#ifdef TARGET_ARM64 + lvaFfrRegister = BAD_VAR_NUM; +#endif #ifdef TARGET_X86 lvaVarargsBaseOfStkArgs = BAD_VAR_NUM; #endif // TARGET_X86 diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index cd5032921804d..1092fdd9ee316 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1518,6 +1518,73 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; case NI_Sve_ConditionalSelect: return LowerHWIntrinsicCndSel(node); + case NI_Sve_SetFfr: + { + // Create physReg FFR definition to store FFR register. + unsigned lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + GenTree* storeLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); + BlockRange().InsertAfter(node, ffrReg, storeLclVar); + + break; + } + case NI_Sve_GetFfrByte: + case NI_Sve_GetFfrInt16: + case NI_Sve_GetFfrInt32: + case NI_Sve_GetFfrInt64: + case NI_Sve_GetFfrSByte: + case NI_Sve_GetFfrUInt16: + case NI_Sve_GetFfrUInt32: + case NI_Sve_GetFfrUInt64: + { + unsigned lclNum = comp->lvaFfrRegister; + if (lclNum != BAD_VAR_NUM) + { + // Consume the FFR register value from local variable to simulate "use" of FFR, + // only if we saw its definition earlier. + GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK); + BlockRange().InsertBefore(node, lclVar); + LowerNode(lclVar); + + node->ResetHWIntrinsicId(intrinsicId, lclVar); + } + break; + } + case NI_Sve_LoadVectorFirstFaulting: + { + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + + unsigned lclNum = comp->lvaFfrRegister; + if (lclNum != BAD_VAR_NUM) + { + // Consume the FFR register value from local variable to simulate "use" of FFR, + // only if we saw its definition earlier. + GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK); + BlockRange().InsertBefore(node, lclVar); + LowerNode(lclVar); + + node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); + } + + if (foundUse) + { + unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR")); + LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum); + tmpVarDsc->lvType = node->TypeGet(); + GenTree* storeLclVar; + use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar); + } + + // Create physReg FFR definition to store FFR register. + lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); + + BlockRange().InsertAfter(node, ffrReg, storeFfrLclVar); + + break; + } default: break; } diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 5d6cf7f1945e4..ad28f82973540 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -607,9 +607,16 @@ int LinearScan::BuildNode(GenTree* tree) switch (tree->OperGet()) { default: + { srcCount = BuildSimple(tree); break; - + } + case GT_PHYSREG: + { + srcCount = 0; + BuildDef(tree, getSingleTypeRegMask(tree->AsPhysReg()->gtSrcReg, TYP_MASK)); + break; + } case GT_LCL_VAR: // We make a final determination about whether a GT_LCL_VAR is a candidate or contained // after liveness. In either case we don't build any uses or defs. Otherwise, this is a diff --git a/src/coreclr/jit/registerarm64.h b/src/coreclr/jit/registerarm64.h index d296ab9497858..d099493535f43 100644 --- a/src/coreclr/jit/registerarm64.h +++ b/src/coreclr/jit/registerarm64.h @@ -115,7 +115,7 @@ REGDEF(P12, 12+PBASE, PMASK(12), "p12", "na") REGDEF(P13, 13+PBASE, PMASK(13), "p13", "na") REGDEF(P14, 14+PBASE, PMASK(14), "p14", "na") REGDEF(P15, 15+PBASE, PMASK(15), "p15", "na") - +REGALIAS(FFR, P14) // The registers with values 80 (NBASE) and above are not real register numbers #define NBASE 80 diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 1157bf9e5bfc9..94563113562e2 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -130,6 +130,29 @@ unsigned Compiler::getSIMDInitTempVarNum(var_types simdType) return lvaSIMDInitTempVarNum; } +#ifdef TARGET_ARM64 +//------------------------------------------------------------------------ +// Get, and allocate if necessary, the SIMD temp used for various operations. +// The temp is allocated as the maximum sized type of all operations required. +// +// Arguments: +// simdType - Required SIMD type +// +// Returns: +// The temp number +// +unsigned Compiler::getFFRegisterVarNum() +{ + if (lvaFfrRegister == BAD_VAR_NUM) + { + lvaFfrRegister = lvaGrabTempWithImplicitUse(false DEBUGARG("Save the FFR value.")); + lvaTable[lvaFfrRegister].lvType = TYP_MASK; + lvaTable[lvaFfrRegister].lvUsedInSIMDIntrinsic = true; + } + return lvaFfrRegister; +} +#endif + //---------------------------------------------------------------------------------- // Return the base type and size of SIMD vector type given its type handle. // From 10cf342a2232f6f0d642efa6540c618dbccecbf3 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 23 Jul 2024 12:20:18 -0700 Subject: [PATCH 02/14] Change condition for PhysReg --- src/coreclr/jit/lsraarm64.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index ad28f82973540..2352fa5ec0239 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -614,7 +614,15 @@ int LinearScan::BuildNode(GenTree* tree) case GT_PHYSREG: { srcCount = 0; - BuildDef(tree, getSingleTypeRegMask(tree->AsPhysReg()->gtSrcReg, TYP_MASK)); + if (varTypeIsMask(tree)) + { + assert(tree->AsPhysReg()->gtSrcReg == REG_FFR); + BuildDef(tree, getSingleTypeRegMask(tree->AsPhysReg()->gtSrcReg, TYP_MASK)); + } + else + { + BuildSimple(tree); + } break; } case GT_LCL_VAR: From e7507bb1372e35f0f30d73c251e41dd9da5d0b77 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 23 Jul 2024 12:20:44 -0700 Subject: [PATCH 03/14] jit format --- src/coreclr/jit/codegenarm64.cpp | 2 +- src/coreclr/jit/lclvars.cpp | 2 +- src/coreclr/jit/lowerarmarch.cpp | 6 +++--- src/coreclr/jit/simd.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index bb161856a3ca1..eebb60272a587 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2961,7 +2961,7 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode) // We are about to store the FFR on stack. So first read the FFR register and // store it in op1Reg. After that, op1Reg will be stored on the stack. assert(varDsc->lvImplicitlyReferenced == 1); - + regNumber op1Reg = data->GetRegNum(); assert(op1Reg != REG_NA); GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, op1Reg); diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index bbef4c47bbbd5..4dc8fec909def 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -64,7 +64,7 @@ void Compiler::lvaInit() lvaNewObjArrayArgs = BAD_VAR_NUM; lvaGSSecurityCookie = BAD_VAR_NUM; #ifdef TARGET_ARM64 - lvaFfrRegister = BAD_VAR_NUM; + lvaFfrRegister = BAD_VAR_NUM; #endif #ifdef TARGET_X86 lvaVarargsBaseOfStkArgs = BAD_VAR_NUM; diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 1092fdd9ee316..a7751df89fbfb 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1521,8 +1521,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_SetFfr: { // Create physReg FFR definition to store FFR register. - unsigned lclNum = comp->getFFRegisterVarNum(); - GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + unsigned lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); GenTree* storeLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); BlockRange().InsertAfter(node, ffrReg, storeLclVar); @@ -1565,7 +1565,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerNode(lclVar); node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); - } + } if (foundUse) { diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 94563113562e2..74c16fafff987 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -145,10 +145,10 @@ unsigned Compiler::getFFRegisterVarNum() { if (lvaFfrRegister == BAD_VAR_NUM) { - lvaFfrRegister = lvaGrabTempWithImplicitUse(false DEBUGARG("Save the FFR value.")); + lvaFfrRegister = lvaGrabTempWithImplicitUse(false DEBUGARG("Save the FFR value.")); lvaTable[lvaFfrRegister].lvType = TYP_MASK; lvaTable[lvaFfrRegister].lvUsedInSIMDIntrinsic = true; - } + } return lvaFfrRegister; } #endif From b23fac73f7413508a5410e4687f932abfb33834f Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 23 Jul 2024 14:26:02 -0700 Subject: [PATCH 04/14] review feedback --- src/coreclr/jit/codegenarm64.cpp | 14 -------------- src/coreclr/jit/codegenarmarch.cpp | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index eebb60272a587..2f6ebaf56fc84 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2954,20 +2954,6 @@ void CodeGen::genCodeForStoreLclVar(GenTreeLclVar* lclNode) } #endif // FEATURE_SIMD -#ifdef FEATURE_MASKED_HW_INTRINSICS - if ((compiler->lvaFfrRegister == lclNode->GetLclNum()) && (targetReg == REG_NA)) - { - assert(targetType == TYP_MASK); - // We are about to store the FFR on stack. So first read the FFR register and - // store it in op1Reg. After that, op1Reg will be stored on the stack. - assert(varDsc->lvImplicitlyReferenced == 1); - - regNumber op1Reg = data->GetRegNum(); - assert(op1Reg != REG_NA); - GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, op1Reg); - } -#endif - genConsumeRegs(data); regNumber dataReg = REG_NA; diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 16cad5618112b..0f9104f171661 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1508,8 +1508,18 @@ void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree) var_types targetType = tree->TypeGet(); regNumber targetReg = tree->GetRegNum(); - inst_Mov(targetType, targetReg, tree->gtSrcReg, /* canSkip */ true); - genTransferRegGCState(targetReg, tree->gtSrcReg); +#ifdef TARGET_ARM64 + if (varTypeIsMask(targetType)) + { + assert(tree->gtSrcReg == REG_FFR); + GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, REG_FFR); + } + else +#endif + { + inst_Mov(targetType, targetReg, tree->gtSrcReg, /* canSkip */ true); + genTransferRegGCState(targetReg, tree->gtSrcReg); + } genProduceReg(tree); } From 0c8b68869d8e878536211352deebb60f4e965fff Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 23 Jul 2024 14:41:39 -0700 Subject: [PATCH 05/14] unspill for LoadVectorFirstFaulting as well --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 30 ++++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 4141839958c80..7cb147c4ecfde 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -320,7 +320,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emitSize = EA_SCALABLE; opt = emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType)); - bool readsFfr = false; switch (intrin.id) { case NI_Sve_GetFfrByte: @@ -331,17 +330,28 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_GetFfrUInt16: case NI_Sve_GetFfrUInt32: case NI_Sve_GetFfrUInt64: + { + if ((intrin.op1 != nullptr) && ((intrin.op1->gtFlags & GTF_SPILLED) != 0)) + { + // If there was a op1 for this intrinsic, it means FFR is consumed here + // and we need to unspill. + unspilledFfr = true; + } + break; + } case NI_Sve_LoadVectorFirstFaulting: - readsFfr = true; + { + if ((intrin.op3 != nullptr) && ((intrin.op3->gtFlags & GTF_SPILLED) != 0)) + { + // If there was a op3 for this intrinsic, it means FFR is consumed here + // and we need to unspill. + unspilledFfr = true; + } break; + } default: break; } - - if (readsFfr && (intrin.op1 != nullptr) && ((intrin.op1->gtFlags & GTF_SPILLED) != 0)) - { - unspilledFfr = true; - } } else if (intrin.category == HW_Category_Special) { @@ -2392,6 +2402,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_LoadVectorFirstFaulting: { + if (unspilledFfr) + { + // We have unspilled the FFR in op1Reg. Restore it back in FFR register. + GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt); + } + insScalableOpts sopt = (opt == INS_OPTS_SCALABLE_B) ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N; GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, REG_ZR, opt, sopt); break; From 36984d0ed80c970722e91b2dde7b35b1acc84734 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Tue, 23 Jul 2024 15:19:41 -0700 Subject: [PATCH 06/14] Use the right opReg --- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 7cb147c4ecfde..27d94ea7647a6 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -2405,7 +2405,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) if (unspilledFfr) { // We have unspilled the FFR in op1Reg. Restore it back in FFR register. - GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt); + assert(op3Reg != REG_NA); + GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op3Reg, opt); } insScalableOpts sopt = (opt == INS_OPTS_SCALABLE_B) ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N; @@ -2425,6 +2426,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) if (unspilledFfr) { // We have unspilled the FFR in op1Reg. Restore it back in FFR register. + assert(op1Reg != REG_NA); GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt); } From c3d90dcb283a22caa11052a76ff97039ded1e363 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 24 Jul 2024 22:33:26 -0700 Subject: [PATCH 07/14] skip spilling tracking --- src/coreclr/jit/gentree.cpp | 14 +- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 57 +------- src/coreclr/jit/instr.cpp | 2 +- src/coreclr/jit/lower.cpp | 4 + src/coreclr/jit/lower.h | 3 + src/coreclr/jit/lowerarmarch.cpp | 139 +++++++++++++++----- 6 files changed, 125 insertions(+), 94 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index c7c1cd549d9f8..eb5966cde5062 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -11537,6 +11537,14 @@ void Compiler::gtGetLclVarNameInfo(unsigned lclNum, const char** ilKindOut, cons ilKind = "cse"; ilNum = lclNum - optCSEstart; } +#ifdef TARGET_ARM64 + else if (lclNum == lvaFfrRegister) + { + // We introduce this LclVar in lowering, hence special case the printing of + // it instead of handling it in "rationalizer" below. + ilName = "FFReg"; + } +#endif else if (lclNum >= optCSEstart) { // Currently any new LclVar's introduced after the CSE phase @@ -11558,12 +11566,6 @@ void Compiler::gtGetLclVarNameInfo(unsigned lclNum, const char** ilKindOut, cons { ilName = "GsCookie"; } -#ifdef TARGET_ARM64 - else if (lclNum == lvaFfrRegister) - { - ilName = "FFReg"; - } -#endif else if (lclNum == lvaRetAddrVar) { ilName = "ReturnAddress"; diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 27d94ea7647a6..934ed32b81d75 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -303,7 +303,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emitAttr emitSize; insOpts opt; - bool unspilledFfr = false; if (HWIntrinsicInfo::SIMDScalar(intrin.id)) { @@ -319,39 +318,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { emitSize = EA_SCALABLE; opt = emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType)); - - switch (intrin.id) - { - case NI_Sve_GetFfrByte: - case NI_Sve_GetFfrInt16: - case NI_Sve_GetFfrInt32: - case NI_Sve_GetFfrInt64: - case NI_Sve_GetFfrSByte: - case NI_Sve_GetFfrUInt16: - case NI_Sve_GetFfrUInt32: - case NI_Sve_GetFfrUInt64: - { - if ((intrin.op1 != nullptr) && ((intrin.op1->gtFlags & GTF_SPILLED) != 0)) - { - // If there was a op1 for this intrinsic, it means FFR is consumed here - // and we need to unspill. - unspilledFfr = true; - } - break; - } - case NI_Sve_LoadVectorFirstFaulting: - { - if ((intrin.op3 != nullptr) && ((intrin.op3->gtFlags & GTF_SPILLED) != 0)) - { - // If there was a op3 for this intrinsic, it means FFR is consumed here - // and we need to unspill. - unspilledFfr = true; - } - break; - } - default: - break; - } } else if (intrin.category == HW_Category_Special) { @@ -2402,9 +2368,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_LoadVectorFirstFaulting: { - if (unspilledFfr) + if (intrin.numOperands == 3) { - // We have unspilled the FFR in op1Reg. Restore it back in FFR register. + // We have extra argument which means there is a "use" of FFR here. Restore it back in FFR register. assert(op3Reg != REG_NA); GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op3Reg, opt); } @@ -2414,25 +2380,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_Sve_GetFfrByte: - case NI_Sve_GetFfrInt16: - case NI_Sve_GetFfrInt32: - case NI_Sve_GetFfrInt64: - case NI_Sve_GetFfrSByte: - case NI_Sve_GetFfrUInt16: - case NI_Sve_GetFfrUInt32: - case NI_Sve_GetFfrUInt64: - { - if (unspilledFfr) - { - // We have unspilled the FFR in op1Reg. Restore it back in FFR register. - assert(op1Reg != REG_NA); - GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op1Reg, opt); - } - - GetEmitter()->emitIns_R(ins, emitSize, targetReg, INS_OPTS_SCALABLE_B); - break; - } case NI_Sve_SetFfr: { assert(targetReg == REG_NA); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index f31e8b364c6ed..17a080b740cdc 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -721,7 +721,7 @@ void CodeGen::inst_TT_RV(instruction ins, emitAttr size, GenTree* tree, regNumbe unsigned varNum = tree->AsLclVarCommon()->GetLclNum(); assert(varNum < compiler->lvaCount); #if CPU_LOAD_STORE_ARCH - assert(GetEmitter()->emitInsIsStore(ins)); + assert(GetEmitter()->emitInsIsStore(ins) || (ins == INS_sve_str)); #endif GetEmitter()->emitIns_S_R(ins, size, reg, varNum, 0); } diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index d57d1b893d68a..309eae1f9580c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -479,6 +479,9 @@ GenTree* Lowering::LowerNode(GenTree* node) { return newNode; } +#ifdef TARGET_ARM64 + m_ffrTrashed = true; +#endif } break; @@ -7902,6 +7905,7 @@ void Lowering::LowerBlock(BasicBlock* block) m_block = block; #ifdef TARGET_ARM64 m_blockIndirs.Reset(); + m_ffrTrashed = true; #endif // NOTE: some of the lowering methods insert calls before the node being diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 81e337abdaf66..cc23902a527b4 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -429,6 +429,8 @@ class Lowering final : public Phase void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); void LowerModPow2(GenTree* node); bool TryLowerAddForPossibleContainment(GenTreeOp* node, GenTree** next); + void CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node); + void StoreFFRValue(GenTreeHWIntrinsic* node); #endif // !TARGET_XARCH && !TARGET_ARM64 GenTree* InsertNewSimdCreateScalarUnsafeNode(var_types type, GenTree* op1, @@ -629,6 +631,7 @@ class Lowering final : public Phase } }; ArrayStack m_blockIndirs; + bool m_ffrTrashed; #endif }; diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index a7751df89fbfb..affa63e7bd81c 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1520,12 +1520,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return LowerHWIntrinsicCndSel(node); case NI_Sve_SetFfr: { - // Create physReg FFR definition to store FFR register. - unsigned lclNum = comp->getFFRegisterVarNum(); - GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); - GenTree* storeLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); - BlockRange().InsertAfter(node, ffrReg, storeLclVar); - + StoreFFRValue(node); break; } case NI_Sve_GetFfrByte: @@ -1537,51 +1532,62 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_GetFfrUInt32: case NI_Sve_GetFfrUInt64: { - unsigned lclNum = comp->lvaFfrRegister; - if (lclNum != BAD_VAR_NUM) + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + if (foundUse) { - // Consume the FFR register value from local variable to simulate "use" of FFR, - // only if we saw its definition earlier. - GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK); - BlockRange().InsertBefore(node, lclVar); - LowerNode(lclVar); + // Remove GetFfr and replace the GetFfr's use with the lvaFfrRegister lclVar. + CreateFFRDefIfNeeded(node); - node->ResetHWIntrinsicId(intrinsicId, lclVar); + GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); + BlockRange().InsertBefore(node, lclVar); + use.ReplaceWith(lclVar); + GenTree* next = node->gtNext; + BlockRange().Remove(node); + return next; } + else + { + node->SetUnusedValue(); + } + break; } case NI_Sve_LoadVectorFirstFaulting: { + unsigned lclNum = comp->lvaFfrRegister; + LIR::Use use; bool foundUse = BlockRange().TryGetUse(node, &use); - unsigned lclNum = comp->lvaFfrRegister; - if (lclNum != BAD_VAR_NUM) + if (foundUse) { - // Consume the FFR register value from local variable to simulate "use" of FFR, - // only if we saw its definition earlier. - GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK); - BlockRange().InsertBefore(node, lclVar); - LowerNode(lclVar); + if (m_ffrTrashed) + { + // Consume the FFR register value from local variable to simulate "use" of FFR, + // only if it was trashed. If it was not trashed, we do not have to reload the + // contents of the FFR register. - node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); - } + CreateFFRDefIfNeeded(node); - if (foundUse) - { + GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); + BlockRange().InsertBefore(node, lclVar); + LowerNode(lclVar); + + node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); + } unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR")); LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum); tmpVarDsc->lvType = node->TypeGet(); GenTree* storeLclVar; use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar); - } - // Create physReg FFR definition to store FFR register. - lclNum = comp->getFFRegisterVarNum(); - GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); - GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); - - BlockRange().InsertAfter(node, ffrReg, storeFfrLclVar); + StoreFFRValue(node); + } + else + { + node->SetUnusedValue(); + } break; } @@ -3844,6 +3850,75 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* cndSelNode) ContainCheckHWIntrinsic(cndSelNode); return cndSelNode->gtNext; } + +#if defined(TARGET_ARM64) +//---------------------------------------------------------------------------------------------- +// StoreFFRValue: For hwintrinsic that produce a first faulting register (FFR) value, create +// nodes to save its value to a local variable. +// +// Arguments: +// node - The node before which the pseudo definition is needed +// +void Lowering::StoreFFRValue(GenTreeHWIntrinsic* node) +{ +#ifdef DEBUG + switch (node->GetHWIntrinsicId()) + { + case NI_Sve_SetFfr: + case NI_Sve_LoadVectorFirstFaulting: + break; + default: + assert(!"Unexpected HWIntrinsicId"); + } +#endif + + // Create physReg FFR definition to store FFR register. + unsigned lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + GenTree* storeLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); + BlockRange().InsertAfter(node, ffrReg, storeLclVar); + m_ffrTrashed = false; +} + +//---------------------------------------------------------------------------------------------- +// CreateFfrDefIfNeeded: If there is a use before a proper defintion (through APIs like SetFfr), +// then create a pseudo definition of FFR and store it in a local variable. +// +// Arguments: +// node - The node before which the pseudo definition is needed +// +void Lowering::CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node) +{ +#ifdef DEBUG + switch (node->GetHWIntrinsicId()) + { + case NI_Sve_GetFfrByte: + case NI_Sve_GetFfrInt16: + case NI_Sve_GetFfrInt32: + case NI_Sve_GetFfrInt64: + case NI_Sve_GetFfrSByte: + case NI_Sve_GetFfrUInt16: + case NI_Sve_GetFfrUInt32: + case NI_Sve_GetFfrUInt64: + case NI_Sve_LoadVectorFirstFaulting: + break; + default: + assert(!"Unexpected HWIntrinsicId"); + } +#endif + unsigned lclNum = comp->lvaFfrRegister; + if (lclNum == BAD_VAR_NUM) + { + lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); + BlockRange().InsertBefore(node, ffrReg, storeFfrLclVar); + LowerNode(ffrReg); + LowerNode(storeFfrLclVar); + } +} +#endif + #endif // FEATURE_HW_INTRINSICS #endif // TARGET_ARMARCH From 81446bd6ac909628d0a44ef5371f9c89a431fe44 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 08:56:03 -0700 Subject: [PATCH 08/14] review feedback --- src/coreclr/jit/hwintrinsiclistarm64sve.h | 16 ++++----- src/coreclr/jit/lowerarmarch.cpp | 44 +++++++++++------------ 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index 9d9d8521da688..8a531918261a3 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -122,14 +122,14 @@ HARDWARE_INTRINSIC(Sve, GatherVectorUInt32WithByteOffsetsZeroExtend, HARDWARE_INTRINSIC(Sve, GatherVectorUInt32ZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, GatherVectorWithByteOffsets, -1, 3, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, GetActiveElementCount, -1, 2, true, {INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_ExplicitMaskedOperation) -HARDWARE_INTRINSIC(Sve, GetFfrByte, -1, -1, false, {INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrSByte, -1, -1, false, {INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrUInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrUInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, GetFfrUInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, GetFfrByte, -1, -1, false, {INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrSByte, -1, -1, false, {INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt16, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, GetFfrUInt64, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, InsertIntoShiftedVector, -1, 2, true, {INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, LeadingSignCount, -1, -1, false, {INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LeadingZeroCount, -1, -1, false, {INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index affa63e7bd81c..f746e4acc9134 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1560,35 +1560,35 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LIR::Use use; bool foundUse = BlockRange().TryGetUse(node, &use); - if (foundUse) + if (m_ffrTrashed) { - if (m_ffrTrashed) - { - // Consume the FFR register value from local variable to simulate "use" of FFR, - // only if it was trashed. If it was not trashed, we do not have to reload the - // contents of the FFR register. + // Consume the FFR register value from local variable to simulate "use" of FFR, + // only if it was trashed. If it was not trashed, we do not have to reload the + // contents of the FFR register. - CreateFFRDefIfNeeded(node); + CreateFFRDefIfNeeded(node); - GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); - BlockRange().InsertBefore(node, lclVar); - LowerNode(lclVar); + GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); + BlockRange().InsertBefore(node, lclVar); + LowerNode(lclVar); - node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); - } + node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar); + } + + if (foundUse) + { unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR")); LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum); tmpVarDsc->lvType = node->TypeGet(); GenTree* storeLclVar; use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar); - - StoreFFRValue(node); } else { node->SetUnusedValue(); } + StoreFFRValue(node); break; } default: @@ -3906,16 +3906,12 @@ void Lowering::CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node) assert(!"Unexpected HWIntrinsicId"); } #endif - unsigned lclNum = comp->lvaFfrRegister; - if (lclNum == BAD_VAR_NUM) - { - lclNum = comp->getFFRegisterVarNum(); - GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); - GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); - BlockRange().InsertBefore(node, ffrReg, storeFfrLclVar); - LowerNode(ffrReg); - LowerNode(storeFfrLclVar); - } + unsigned lclNum = comp->getFFRegisterVarNum(); + GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); + GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); + BlockRange().InsertBefore(node, ffrReg, storeFfrLclVar); + LowerNode(ffrReg); + LowerNode(storeFfrLclVar); } #endif From 06fe8d99eb92b777722c699921d8f0b9bb53305d Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Wed, 24 Jul 2024 23:07:49 -0700 Subject: [PATCH 09/14] Use non-existent REG_FFR --- src/coreclr/jit/codegenarmarch.cpp | 2 +- src/coreclr/jit/lsra.h | 2 +- src/coreclr/jit/lsrabuild.cpp | 5 +---- src/coreclr/jit/registerarm64.h | 4 ++-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 0f9104f171661..b0d5d5727fc50 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1512,7 +1512,7 @@ void CodeGen::genCodeForPhysReg(GenTreePhysReg* tree) if (varTypeIsMask(targetType)) { assert(tree->gtSrcReg == REG_FFR); - GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, REG_FFR); + GetEmitter()->emitIns_R(INS_sve_rdffr, EA_SCALABLE, targetReg); } else #endif diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 59963e02d7438..3d86fa3e9b6cc 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -511,7 +511,7 @@ class RegRecord : public Referenceable #if defined(FEATURE_MASKED_HW_INTRINSICS) else { - assert(emitter::isMaskReg(reg)); + assert(emitter::isMaskReg(reg) || (reg == REG_FFR)); registerType = MaskRegisterType; } #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 46577c8050fef..ec975e1b5b3fc 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -2841,10 +2841,7 @@ void LinearScan::buildIntervals() #ifdef HAS_MORE_THAN_64_REGISTERS else if (availableRegCount < (sizeof(regMaskTP) * 8)) { - // Mask out the bits that are between (8 * regMaskTP) ~ availableRegCount - // Subtract one extra for stack. - unsigned topRegCount = availableRegCount - sizeof(regMaskSmall) * 8 - 1; - actualRegistersMask = regMaskTP(~RBM_NONE, (1ULL << topRegCount) - 1); + actualRegistersMask = regMaskTP(~RBM_NONE, availableMaskRegs); } #endif else diff --git a/src/coreclr/jit/registerarm64.h b/src/coreclr/jit/registerarm64.h index d099493535f43..6b8091814251e 100644 --- a/src/coreclr/jit/registerarm64.h +++ b/src/coreclr/jit/registerarm64.h @@ -115,14 +115,14 @@ REGDEF(P12, 12+PBASE, PMASK(12), "p12", "na") REGDEF(P13, 13+PBASE, PMASK(13), "p13", "na") REGDEF(P14, 14+PBASE, PMASK(14), "p14", "na") REGDEF(P15, 15+PBASE, PMASK(15), "p15", "na") -REGALIAS(FFR, P14) // The registers with values 80 (NBASE) and above are not real register numbers #define NBASE 80 REGDEF(SP, 0+NBASE, 0x0000, "sp", "wsp?") +REGDEF(FFR, 1+NBASE, 0x0000, "ffr", "na") // This must be last! -REGDEF(STK, 1+NBASE, 0x0000, "STK", "STK") +REGDEF(STK, 2+NBASE, 0x0000, "STK", "STK") /*****************************************************************************/ #undef RMASK From b685efd210f3a7b5bdcc115acb0de955d3bd0edc Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 12:31:37 -0700 Subject: [PATCH 10/14] Do not reload from FFR for GetFfr() --- src/coreclr/jit/lowerarmarch.cpp | 10 +++------- src/coreclr/jit/lsrabuild.cpp | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index f746e4acc9134..9e1649f720097 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1536,10 +1536,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); if (foundUse) { - // Remove GetFfr and replace the GetFfr's use with the lvaFfrRegister lclVar. - CreateFFRDefIfNeeded(node); - - GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); + unsigned lclNum = comp->getFFRegisterVarNum(); + GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK); BlockRange().InsertBefore(node, lclVar); use.ReplaceWith(lclVar); GenTree* next = node->gtNext; @@ -1555,8 +1553,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) } case NI_Sve_LoadVectorFirstFaulting: { - unsigned lclNum = comp->lvaFfrRegister; - LIR::Use use; bool foundUse = BlockRange().TryGetUse(node, &use); @@ -3906,7 +3902,7 @@ void Lowering::CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node) assert(!"Unexpected HWIntrinsicId"); } #endif - unsigned lclNum = comp->getFFRegisterVarNum(); + unsigned lclNum = comp->getFFRegisterVarNum(); GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); BlockRange().InsertBefore(node, ffrReg, storeFfrLclVar); diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index ec975e1b5b3fc..ee6d423db46d9 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -2841,7 +2841,7 @@ void LinearScan::buildIntervals() #ifdef HAS_MORE_THAN_64_REGISTERS else if (availableRegCount < (sizeof(regMaskTP) * 8)) { - actualRegistersMask = regMaskTP(~RBM_NONE, availableMaskRegs); + actualRegistersMask = regMaskTP(~RBM_NONE, availableMaskRegs); } #endif else From ce63d38184db1b61c6af1f0428bae6dcf56b282a Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 13:38:20 -0700 Subject: [PATCH 11/14] review feedback --- src/coreclr/jit/lower.h | 1 - src/coreclr/jit/lowerarmarch.cpp | 38 +------------------------------- 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index cc23902a527b4..7785bb48dcd01 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -429,7 +429,6 @@ class Lowering final : public Phase void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); void LowerModPow2(GenTree* node); bool TryLowerAddForPossibleContainment(GenTreeOp* node, GenTree** next); - void CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node); void StoreFFRValue(GenTreeHWIntrinsic* node); #endif // !TARGET_XARCH && !TARGET_ARM64 GenTree* InsertNewSimdCreateScalarUnsafeNode(var_types type, diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 9e1649f720097..50615dd4e4a1b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1562,8 +1562,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // only if it was trashed. If it was not trashed, we do not have to reload the // contents of the FFR register. - CreateFFRDefIfNeeded(node); - GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK); BlockRange().InsertBefore(node, lclVar); LowerNode(lclVar); @@ -3875,41 +3873,7 @@ void Lowering::StoreFFRValue(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(node, ffrReg, storeLclVar); m_ffrTrashed = false; } - -//---------------------------------------------------------------------------------------------- -// CreateFfrDefIfNeeded: If there is a use before a proper defintion (through APIs like SetFfr), -// then create a pseudo definition of FFR and store it in a local variable. -// -// Arguments: -// node - The node before which the pseudo definition is needed -// -void Lowering::CreateFFRDefIfNeeded(GenTreeHWIntrinsic* node) -{ -#ifdef DEBUG - switch (node->GetHWIntrinsicId()) - { - case NI_Sve_GetFfrByte: - case NI_Sve_GetFfrInt16: - case NI_Sve_GetFfrInt32: - case NI_Sve_GetFfrInt64: - case NI_Sve_GetFfrSByte: - case NI_Sve_GetFfrUInt16: - case NI_Sve_GetFfrUInt32: - case NI_Sve_GetFfrUInt64: - case NI_Sve_LoadVectorFirstFaulting: - break; - default: - assert(!"Unexpected HWIntrinsicId"); - } -#endif - unsigned lclNum = comp->getFFRegisterVarNum(); - GenTree* ffrReg = comp->gtNewPhysRegNode(REG_FFR, TYP_MASK); - GenTree* storeFfrLclVar = comp->gtNewStoreLclVarNode(lclNum, ffrReg); - BlockRange().InsertBefore(node, ffrReg, storeFfrLclVar); - LowerNode(ffrReg); - LowerNode(storeFfrLclVar); -} -#endif +#endif // TARGET_ARM64 #endif // FEATURE_HW_INTRINSICS From b5851dda269f6f48daef2d536d475ba11cd5691a Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 14:00:07 -0700 Subject: [PATCH 12/14] Make just GrabTemp --- src/coreclr/jit/simd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 74c16fafff987..d45f7ae735e9a 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -145,7 +145,7 @@ unsigned Compiler::getFFRegisterVarNum() { if (lvaFfrRegister == BAD_VAR_NUM) { - lvaFfrRegister = lvaGrabTempWithImplicitUse(false DEBUGARG("Save the FFR value.")); + lvaFfrRegister = lvaGrabTemp(false DEBUGARG("Save the FFR value.")); lvaTable[lvaFfrRegister].lvType = TYP_MASK; lvaTable[lvaFfrRegister].lvUsedInSIMDIntrinsic = true; } From c96b8b58086299bcb675193065270d9cc06c8134 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 15:07:00 -0700 Subject: [PATCH 13/14] fix build and formatting --- src/coreclr/jit/lsra.h | 4 ++++ src/coreclr/jit/simd.cpp | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 3d86fa3e9b6cc..b04af2082c90f 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -511,7 +511,11 @@ class RegRecord : public Referenceable #if defined(FEATURE_MASKED_HW_INTRINSICS) else { +#ifdef TARGET_ARM64 assert(emitter::isMaskReg(reg) || (reg == REG_FFR)); +#else + assert(emitter::isMaskReg(reg)); +#endif registerType = MaskRegisterType; } #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index d45f7ae735e9a..6256ee0c37799 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -145,8 +145,8 @@ unsigned Compiler::getFFRegisterVarNum() { if (lvaFfrRegister == BAD_VAR_NUM) { - lvaFfrRegister = lvaGrabTemp(false DEBUGARG("Save the FFR value.")); - lvaTable[lvaFfrRegister].lvType = TYP_MASK; + lvaFfrRegister = lvaGrabTemp(false DEBUGARG("Save the FFR value.")); + lvaTable[lvaFfrRegister].lvType = TYP_MASK; lvaTable[lvaFfrRegister].lvUsedInSIMDIntrinsic = true; } return lvaFfrRegister; From 5cdaace7fa750178868ad888684853ecc00d6f87 Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Thu, 25 Jul 2024 15:52:45 -0700 Subject: [PATCH 14/14] missed another build failure for arm --- src/coreclr/jit/instr.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 17a080b740cdc..bc0300c83a846 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -721,7 +721,13 @@ void CodeGen::inst_TT_RV(instruction ins, emitAttr size, GenTree* tree, regNumbe unsigned varNum = tree->AsLclVarCommon()->GetLclNum(); assert(varNum < compiler->lvaCount); #if CPU_LOAD_STORE_ARCH +#ifdef TARGET_ARM64 + // Workaround until https://github.com/dotnet/runtime/issues/105512 is fixed. assert(GetEmitter()->emitInsIsStore(ins) || (ins == INS_sve_str)); +#else + assert(GetEmitter()->emitInsIsStore(ins)); +#endif + #endif GetEmitter()->emitIns_S_R(ins, size, reg, varNum, 0); }