diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 0f91f798bb946..aee8ecc0c4e59 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -476,8 +476,6 @@ class CodeGen final : public CodeGenInterface
     // Save/Restore callee saved float regs to stack
     void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
     void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
-    // Generate VZeroupper instruction to avoid AVX/SSE transition penalty
-    void genVzeroupperIfNeeded(bool check256bitOnly = true);
 
 #endif // TARGET_XARCH
 
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 03a255d98078a..b877262b1583d 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
     }
 #endif // defined(DEBUG) && defined(TARGET_X86)
 
-    // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
-    // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
-    // transition penalty, assuming the user function contains legacy SSE instruction.
-    // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
-    // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
-    // when there's preceding 256-bit AVX to legacy SSE transition penalty.
-    // This applies to 512bit AVX512 instructions as well.
-    if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
-    {
-        assert(compiler->canUseVexEncoding());
+    if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also uses 256-bit or higher
+        // AVX itself. This means we couldn't optimize to only emitting a single vzeroupper in
+        // the method prologue and instead need to insert one before each call that needs it.
+
         instGen(INS_vzeroupper);
     }
 
@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
 // funclet frames: this will be FuncletInfo.fiSpDelta.
 void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 {
-    genVzeroupperIfNeeded(false);
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
+    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
+        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue.
+        // This reduces the overall amount of codegen, particularly for more common paths not using any
+        // SIMD or floating-point.
+
+        instGen(INS_vzeroupper);
+    }
+
     // fast path return
     if (regMask == RBM_NONE)
     {
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
+    if (GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        instGen(INS_vzeroupper);
+    }
+
     // fast path return
     if (regMask == RBM_NONE)
     {
-        genVzeroupperIfNeeded();
         return;
     }
 
@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
-    genVzeroupperIfNeeded();
 }
-
-// Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the
-// AVX/Legacy SSE transition penalties can be avoided. This function is been used in genPreserveCalleeSavedFltRegs
-// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains
-// 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native
-// code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog
-// if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty.
-//
-// Params
-//    check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
-//                      instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
-//
-void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
-{
-    bool emitVzeroUpper = false;
-    if (check256bitOnly)
-    {
-        emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
-    }
-    else
-    {
-        emitVzeroUpper = GetEmitter()->ContainsAVX();
-    }
-
-    if (emitVzeroUpper)
-    {
-        assert(compiler->canUseVexEncoding());
-        instGen(INS_vzeroupper);
-    }
-}
 
 //-----------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index a4d15fe5706b3..77e2e1d184c27 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2312,6 +2312,7 @@ void Compiler::compSetProcessor()
         // Assume each JITted method does not contain AVX instruction at first
         codeGen->GetEmitter()->SetContainsAVX(false);
         codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
+        codeGen->GetEmitter()->SetContainsCallNeedingVzeroupper(false);
     }
     if (canUseEvexEncoding())
     {
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 1d4dbcdb82aa8..07d1ff60aee96 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9383,6 +9383,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     }
 
 #ifdef TARGET_XARCH
+public:
     bool canUseVexEncoding() const
     {
         return compOpportunisticallyDependsOn(InstructionSet_AVX);
     }
@@ -9399,6 +9400,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         return compOpportunisticallyDependsOn(InstructionSet_AVX512F);
     }
 
+private:
     //------------------------------------------------------------------------
     // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding.
     //
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index d842f91f06a5d..2877346ab4fd3 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -468,6 +468,16 @@ void SetContains256bitOrMoreAVX(bool value)
     contains256bitOrMoreAVXInstruction = value;
 }
 
+bool containsCallNeedingVzeroupper = false;
+bool ContainsCallNeedingVzeroupper() const
+{
+    return containsCallNeedingVzeroupper;
+}
+void SetContainsCallNeedingVzeroupper(bool value)
+{
+    containsCallNeedingVzeroupper = value;
+}
+
 bool IsDstDstSrcAVXInstruction(instruction ins) const;
 bool IsDstSrcSrcAVXInstruction(instruction ins) const;
 bool IsThreeOperandAVXInstruction(instruction ins) const;
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 58c28f322f375..6d41f802ef3d4 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -2076,6 +2076,75 @@ void CallArgs::Remove(CallArg* arg)
     assert(!"Did not find arg to remove in CallArgs::Remove");
 }
 
+#ifdef TARGET_XARCH
+//---------------------------------------------------------------
+// NeedsVzeroupper: Determines if the call needs a vzeroupper emitted before it is invoked
+//
+// Parameters:
+//   comp - the compiler
+//
+// Returns:
+//   true if a vzeroupper needs to be emitted; otherwise, false
+//
+bool GenTreeCall::NeedsVzeroupper(Compiler* comp)
+{
+    bool needsVzeroupper = false;
+
+    if (IsPInvoke() && comp->canUseVexEncoding())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        switch (gtCallType)
+        {
+            case CT_USER_FUNC:
+            case CT_INDIRECT:
+            {
+                // Since P/Invokes are not compiled by the runtime, they are typically "unknown" since they
+                // may use the legacy encoding. This includes both CT_USER_FUNC and CT_INDIRECT.
+
+                needsVzeroupper = true;
+                break;
+            }
+
+            case CT_HELPER:
+            {
+                // Most helpers are well known to not use any floating-point or SIMD logic internally, but
+                // a few do exist so we need to ensure they are handled. They are identified by taking or
+                // returning a floating-point or SIMD type, regardless of how it is actually passed/returned.
+
+                if (varTypeUsesFloatReg(this))
+                {
+                    needsVzeroupper = true;
+                }
+                else
+                {
+                    for (CallArg& arg : gtArgs.Args())
+                    {
+                        if (varTypeUsesFloatReg(arg.GetSignatureType()))
+                        {
+                            needsVzeroupper = true;
+                            break;
+                        }
+                    }
+                }
+                break;
+            }
+
+            default:
+            {
+                unreached();
+            }
+        }
+    }
+
+    return needsVzeroupper;
+}
+#endif // TARGET_XARCH
+
 //---------------------------------------------------------------
 // GetOtherRegMask: Get the reg mask of gtOtherRegs of call node
 //
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index a1dd574b5a64d..49486d48750f4 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -5124,6 +5124,10 @@ struct GenTreeCall final : public GenTree
 #endif
     }
 
+#ifdef TARGET_XARCH
+    bool NeedsVzeroupper(Compiler* comp);
+#endif // TARGET_XARCH
+
     // Get reg mask of all the valid registers of gtOtherRegs array
     regMaskTP GetOtherRegMask() const;
 
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 9e7bda9988006..f43febae6c3a4 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1341,6 +1341,16 @@ int LinearScan::BuildCall(GenTreeCall* call)
         srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
     }
 
+    if (call->NeedsVzeroupper(compiler))
+    {
+        // Much like for Contains256bitOrMoreAVX, we want to track if any
+        // call needs a vzeroupper inserted. This allows us to reduce
+        // the total number of vzeroupper instructions being inserted for cases where
+        // no 256+ AVX is used directly by the method.
+
+        compiler->GetEmitter()->SetContainsCallNeedingVzeroupper(true);
+    }
+
     buildInternalRegisterUses();
 
     // Now generate defs and kills.
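Taken together, the codegen hunks above split responsibility for vzeroupper between the prologue, individual calls, and the epilogue, driven by the two emitter flags tracked in LSRA and compSetProcessor. The following standalone C++ sketch is not part of the patch; it only summarizes that placement logic under assumed names (MethodTraits, describeVzeroupperPlacement), with the two booleans mirroring the emitter's Contains256bitOrMoreAVX and ContainsCallNeedingVzeroupper accessors.

#include <cstdio>

// Hypothetical per-method summary of the two emitter flags used by the change.
struct MethodTraits
{
    bool contains256bitOrMoreAVX;       // the method itself uses 256-bit or wider AVX
    bool containsCallNeedingVzeroupper; // the method contains a P/Invoke that needs vzeroupper
};

void describeVzeroupperPlacement(const MethodTraits& m)
{
    // Prologue (genPreserveCalleeSavedFltRegs): only when some call needs it but the method
    // itself has no 256-bit+ AVX, so a single vzeroupper up front covers every such call.
    if (m.containsCallNeedingVzeroupper && !m.contains256bitOrMoreAVX)
    {
        std::puts("vzeroupper in prologue (covers all calls that need it)");
    }

    // Per call (genCall): when the method also uses 256-bit+ AVX, the upper halves can be
    // dirtied again after the prologue, so one is emitted before each call that needs it.
    if (m.contains256bitOrMoreAVX && m.containsCallNeedingVzeroupper)
    {
        std::puts("vzeroupper before each call that needs it");
    }

    // Epilogue (genRestoreCalleeSavedFltRegs): any method using 256-bit+ AVX gets one before
    // returning, matching the Intel guidance quoted in the diff.
    if (m.contains256bitOrMoreAVX)
    {
        std::puts("vzeroupper in epilogue");
    }
}

int main()
{
    describeVzeroupperPlacement({false, true}); // prologue only
    describeVzeroupperPlacement({true, true});  // per-call + epilogue
    describeVzeroupperPlacement({true, false}); // epilogue only
    return 0;
}

The sketch also shows why the genCall path asserts nothing about VEX availability anymore: NeedsVzeroupper already folds in canUseVexEncoding, so callers only combine it with the coarser emitter flags.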