Update where and when vzeroupper is emitted #98261

Merged: 8 commits, Feb 13, 2024
2 changes: 0 additions & 2 deletions src/coreclr/jit/codegen.h
@@ -476,8 +476,6 @@ class CodeGen final : public CodeGenInterface
// Save/Restore callee saved float regs to stack
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
// Generate VZeroupper instruction to avoid AVX/SSE transition penalty
void genVzeroupperIfNeeded(bool check256bitOnly = true);

#endif // TARGET_XARCH

82 changes: 39 additions & 43 deletions src/coreclr/jit/codegenxarch.cpp
@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
}
#endif // defined(DEBUG) && defined(TARGET_X86)

// When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
// if the function contains 256-bit AVX instructions. This is to avoid the AVX-256 to legacy
// SSE transition penalty, assuming the user function contains legacy SSE instructions.
// To limit the code size impact, we only issue VZEROUPPER before the PInvoke call; we don't
// issue it after the PInvoke call because the legacy SSE to AVX transition penalty can only
// happen after a preceding 256-bit AVX to legacy SSE transition.
// This applies to 512-bit AVX512 instructions as well.
if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
{
assert(compiler->canUseVexEncoding());
if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

// This method contains a call that needs vzeroupper but also uses 256-bit or higher
// AVX itself. This means we can't optimize to emit only a single vzeroupper in
// the method prologue and instead need to insert one before each call that needs it.

instGen(INS_vzeroupper);
}
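As background for the penalty this block avoids (an illustration, not part of the PR): user code faces the same hazard, and the `_mm256_zeroupper` intrinsic compiles to the same `vzeroupper` instruction the JIT emits here. A minimal sketch, assuming a compiler targeting AVX; the function and variable names are invented for the example:

#include <immintrin.h>
#include <stddef.h>

// After 256-bit AVX instructions run, the upper 128 bits of the YMM registers
// are "dirty"; if legacy-encoded SSE instructions execute next without a
// vzeroupper, some microarchitectures pay a state-transition penalty.
void scaleFloats(float* data, size_t count, float factor)
{
    __m256 vfactor = _mm256_set1_ps(factor);

    size_t i = 0;
    for (; i + 8 <= count; i += 8)
    {
        __m256 v = _mm256_loadu_ps(data + i);
        _mm256_storeu_ps(data + i, _mm256_mul_ps(v, vfactor));
    }

    // Emits vzeroupper: zeroes the upper YMM bits so any legacy SSE code that
    // runs next (e.g. in a non-VEX callee) avoids the transition penalty.
    _mm256_zeroupper();

    for (; i < count; i++)
    {
        data[i] *= factor; // scalar tail; may compile to legacy SSE
    }
}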

@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
// funclet frames: this will be FuncletInfo.fiSpDelta.
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
{
genVzeroupperIfNeeded(false);
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

// Only callee saved floating point registers should be in regMask
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);

if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

// This method contains a call that needs vzeroupper but doesn't itself use 256-bit or
// higher AVX. Thus we can optimize to emit only a single vzeroupper in the function prologue.
// This reduces the overall amount of codegen, particularly for the more common paths that
// don't use any SIMD or floating-point.

instGen(INS_vzeroupper);
}

// fast path return
if (regMask == RBM_NONE)
{
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
// Only callee saved floating point registers should be in regMask
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);

if (GetEmitter()->Contains256bitOrMoreAVX())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

instGen(INS_vzeroupper);
}

// fast path return
if (regMask == RBM_NONE)
{
genVzeroupperIfNeeded();
return;
}

@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
offset -= XMM_REGSIZE_BYTES;
}
}
genVzeroupperIfNeeded();
}

// Generate Vzeroupper instruction as needed to zero out the upper 128 bits of all YMM registers
// so that the AVX/Legacy SSE transition penalties can be avoided. This function is used in
// genPreserveCalleeSavedFltRegs (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue
// VZEROUPPER in the prolog if the method contains 128-bit or 256-bit AVX code, to avoid the
// legacy SSE to AVX transition penalty, which could happen when native code contains legacy
// SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in the epilog
// if the method contains 256-bit AVX code, to avoid the AVX to legacy SSE transition penalty.
//
// Params
// check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
// instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
//
void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
{
bool emitVzeroUpper = false;
if (check256bitOnly)
{
emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
}
else
{
emitVzeroUpper = GetEmitter()->ContainsAVX();
}

if (emitVzeroUpper)
{
assert(compiler->canUseVexEncoding());
instGen(INS_vzeroupper);
}
}

//-----------------------------------------------------------------------------------
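Taken together, the new policy in this file can be condensed as follows (an editor's summary sketch using names from the diff, not code that appears in the PR):

// Where vzeroupper is emitted after this change (illustrative only):
//
// before a call:   GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler)
// in the prologue: GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX()
// in the epilogue: GetEmitter()->Contains256bitOrMoreAVX()
//
// Net effect: a method that itself uses 256-bit or wider AVX pays one vzeroupper
// per call that needs it, plus one in the epilogue; a method that merely makes
// such calls pays a single vzeroupper in the prologue.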
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.cpp
@@ -2312,6 +2312,7 @@ void Compiler::compSetProcessor()
// Assume each JITted method does not contain AVX instructions at first
codeGen->GetEmitter()->SetContainsAVX(false);
codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
codeGen->GetEmitter()->SetContainsCallNeedingVzeroupper(false);
}
if (canUseEvexEncoding())
{
2 changes: 2 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -9383,6 +9383,7 @@
}

#ifdef TARGET_XARCH
public:
bool canUseVexEncoding() const
{
return compOpportunisticallyDependsOn(InstructionSet_AVX);
@@ -9399,6 +9400,7 @@
return compOpportunisticallyDependsOn(InstructionSet_AVX512F);
}

private:
//------------------------------------------------------------------------
// DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding.
//
10 changes: 10 additions & 0 deletions src/coreclr/jit/emitxarch.h
@@ -468,6 +468,16 @@ void SetContains256bitOrMoreAVX(bool value)
contains256bitOrMoreAVXInstruction = value;
}

bool containsCallNeedingVzeroupper = false;
bool ContainsCallNeedingVzeroupper() const
{
return containsCallNeedingVzeroupper;
}
void SetContainsCallNeedingVzeroupper(bool value)
{
containsCallNeedingVzeroupper = value;
}

bool IsDstDstSrcAVXInstruction(instruction ins) const;
bool IsDstSrcSrcAVXInstruction(instruction ins) const;
bool IsThreeOperandAVXInstruction(instruction ins) const;
69 changes: 69 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -2076,6 +2076,75 @@ void CallArgs::Remove(CallArg* arg)
assert(!"Did not find arg to remove in CallArgs::Remove");
}

#ifdef TARGET_XARCH
//---------------------------------------------------------------
// NeedsVzeroupper: Determines if the call needs a vzeroupper emitted before it is invoked
//
// Parameters:
// comp - the compiler
//
// Returns:
// true if a vzeroupper needs to be emitted; otherwise, false
//
bool GenTreeCall::NeedsVzeroupper(Compiler* comp)
{
bool needsVzeroupper = false;

if (IsPInvoke() && comp->canUseVexEncoding())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

switch (gtCallType)
{
case CT_USER_FUNC:
case CT_INDIRECT:
{
// Since P/Invokes are not compiled by the runtime, they are typically "unknown" and
// may use the legacy encoding. This includes both CT_USER_FUNC and CT_INDIRECT

needsVzeroupper = true;
break;
}

case CT_HELPER:
{
// Most helpers are well known to not use any floating-point or SIMD logic internally, but
// a few do exist so we need to ensure they are handled. They are identified by taking or
// returning a floating-point or SIMD type, regardless of how it is actually passed/returned.

if (varTypeUsesFloatReg(this))
{
needsVzeroupper = true;
}
else
{
for (CallArg& arg : gtArgs.Args())
{
if (varTypeUsesFloatReg(arg.GetSignatureType()))
{
needsVzeroupper = true;
break;
}
}
}
break;
}

default:
{
unreached();
}
}
}

return needsVzeroupper;
}
#endif // TARGET_XARCH
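To make the predicate concrete, a few illustrative outcomes (my reading of the code above; the helper example is hypothetical rather than taken from the PR):

// Assuming comp->canUseVexEncoding() is true:
//
//   P/Invoke with CT_USER_FUNC or CT_INDIRECT     -> true  (callee is unknown and
//                                                           may use legacy SSE)
//   P/Invoke helper taking or returning a double  -> true  (caught by the
//                                                           varTypeUsesFloatReg checks)
//   P/Invoke helper with integer-only signature   -> false
//   Non-P/Invoke (managed) call                   -> false (JIT-compiled callees
//                                                           use the VEX encoding)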

//---------------------------------------------------------------
// GetOtherRegMask: Get the reg mask of gtOtherRegs of call node
//
4 changes: 4 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -5125,6 +5125,10 @@ struct GenTreeCall final : public GenTree
#endif
}

#ifdef TARGET_XARCH
bool NeedsVzeroupper(Compiler* comp);
#endif // TARGET_XARCH

// Get reg mask of all the valid registers of gtOtherRegs array
regMaskTP GetOtherRegMask() const;

10 changes: 10 additions & 0 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1341,6 +1341,16 @@ int LinearScan::BuildCall(GenTreeCall* call)
srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
}

if (call->NeedsVzeroupper(compiler))
{
// Much like for Contains256bitOrMoreAVX, we want to track if any
// call needs a vzeroupper inserted. This allows us to reduce
// the total number of vzeroupper instructions inserted for cases where
// no 256-bit or higher AVX is used directly by the method.

compiler->GetEmitter()->SetContainsCallNeedingVzeroupper(true);
}

buildInternalRegisterUses();

// Now generate defs and kills.
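One design note, spelled out (my reading, with simplified phase names): the flag is set during LSRA's BuildCall rather than during codegen because register allocation walks every call before any instructions are emitted, so the prologue decision in genPreserveCalleeSavedFltRegs can rely on the flag being final.

// Simplified compilation ordering (approximate; illustration only):
//
// 1. LSRA / BuildCall     - sees every call, so SetContainsCallNeedingVzeroupper(true)
//                           runs before any code is generated
// 2. codegen prologue     - genPreserveCalleeSavedFltRegs reads
//                           ContainsCallNeedingVzeroupper(), which is already final
// 3. codegen method body  - per-call vzeroupper where 256-bit+ AVX is in use
// 4. codegen epilogue     - genRestoreCalleeSavedFltRegs emits vzeroupper if the
//                           method itself used 256-bit or wider AVX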