Update where and when vzeroupper is emitted (#98261)
* Update where and when vzeroupper is emitted

* Ensure we emit vzeroupper for JIT helpers that need it

* Make sure vzeroupper is in genRestoreCalleeSavedFltRegs

* Scope when vzeroupper is emitted to fewer places

* Revert the simplification done to SetContainsAVXFlags

* Try to minimize the TP impact of the improved vzeroupper handling
tannergooding authored Feb 13, 2024
1 parent b9bd1de commit 6d877c5
Showing 8 changed files with 135 additions and 45 deletions.
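
At a high level, the commit splits vzeroupper handling across three emission sites. The sketch below is illustrative only — it is not code from the commit, and `emitter`, `call`, and `emit` are hypothetical stand-ins for the emitter flags, call node, and instGen(INS_vzeroupper) calls that appear in the diffs below:

    // Prologue (genPreserveCalleeSavedFltRegs): the method itself never uses
    // 256-bit+ AVX, so a single vzeroupper up front covers every call that
    // needs one.
    if (emitter->ContainsCallNeedingVzeroupper() && !emitter->Contains256bitOrMoreAVX())
        emit(INS_vzeroupper);

    // Call site (genCall): the method does use 256-bit+ AVX, so the upper
    // register state may be dirty again by call time; emit vzeroupper before
    // each call that needs it (P/Invokes, plus helpers that take or return a
    // floating-point or SIMD type, per GenTreeCall::NeedsVzeroupper).
    if (emitter->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
        emit(INS_vzeroupper);

    // Epilogue (genRestoreCalleeSavedFltRegs): the method used 256-bit+ AVX,
    // so clean the upper state before returning into unknown (possibly
    // legacy-SSE) code.
    if (emitter->Contains256bitOrMoreAVX())
        emit(INS_vzeroupper);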
2 changes: 0 additions & 2 deletions src/coreclr/jit/codegen.h
@@ -476,8 +476,6 @@ class CodeGen final : public CodeGenInterface
     // Save/Restore callee saved float regs to stack
     void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
     void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
-    // Generate VZeroupper instruction to avoid AVX/SSE transition penalty
-    void genVzeroupperIfNeeded(bool check256bitOnly = true);
 
 #endif // TARGET_XARCH
 
82 changes: 39 additions & 43 deletions src/coreclr/jit/codegenxarch.cpp
@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
     }
 #endif // defined(DEBUG) && defined(TARGET_X86)
 
-    // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
-    // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
-    // transition penalty, assuming the user function contains legacy SSE instruction.
-    // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
-    // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
-    // when there's preceding 256-bit AVX to legacy SSE transition penalty.
-    // This applies to 512bit AVX512 instructions as well.
-    if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
-    {
-        assert(compiler->canUseVexEncoding());
+    if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also uses 256-bit or higher
+        // AVX itself. This means we couldn't optimize to only emitting a single vzeroupper in
+        // the method prologue and instead need to insert one before each call that needs it.
+
         instGen(INS_vzeroupper);
     }
 
@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
 // funclet frames: this will be FuncletInfo.fiSpDelta.
 void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 {
-    genVzeroupperIfNeeded(false);
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
+    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
+        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue.
+        // This reduces the overall amount of codegen, particularly for more common paths not using any
+        // SIMD or floating-point.
+
+        instGen(INS_vzeroupper);
+    }
+
     // fast path return
     if (regMask == RBM_NONE)
     {
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
+    if (GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        instGen(INS_vzeroupper);
+    }
+
     // fast path return
     if (regMask == RBM_NONE)
     {
-        genVzeroupperIfNeeded();
         return;
     }
 
@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
-    genVzeroupperIfNeeded();
-}
-
-// Generate Vzeroupper instruction as needed to zero out upper 128b-bit of all YMM registers so that the
-// AVX/Legacy SSE transition penalties can be avoided. This function is been used in genPreserveCalleeSavedFltRegs
-// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains
-// 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native
-// code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog
-// if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty.
-//
-// Params
-//    check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
-//    instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
-//
-void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
-{
-    bool emitVzeroUpper = false;
-    if (check256bitOnly)
-    {
-        emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
-    }
-    else
-    {
-        emitVzeroUpper = GetEmitter()->ContainsAVX();
-    }
-
-    if (emitVzeroUpper)
-    {
-        assert(compiler->canUseVexEncoding());
-        instGen(INS_vzeroupper);
-    }
-}
 }
 
 //-----------------------------------------------------------------------------------
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.cpp
@@ -2312,6 +2312,7 @@ void Compiler::compSetProcessor()
         // Assume each JITted method does not contain AVX instruction at first
         codeGen->GetEmitter()->SetContainsAVX(false);
         codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
+        codeGen->GetEmitter()->SetContainsCallNeedingVzeroupper(false);
     }
     if (canUseEvexEncoding())
     {
2 changes: 2 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -9383,6 +9383,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     }
 
 #ifdef TARGET_XARCH
+public:
     bool canUseVexEncoding() const
     {
         return compOpportunisticallyDependsOn(InstructionSet_AVX);
@@ -9399,6 +9400,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         return compOpportunisticallyDependsOn(InstructionSet_AVX512F);
     }
 
+private:
     //------------------------------------------------------------------------
     // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding.
     //
10 changes: 10 additions & 0 deletions src/coreclr/jit/emitxarch.h
@@ -468,6 +468,16 @@ void SetContains256bitOrMoreAVX(bool value)
     contains256bitOrMoreAVXInstruction = value;
 }
 
+bool containsCallNeedingVzeroupper = false;
+bool ContainsCallNeedingVzeroupper() const
+{
+    return containsCallNeedingVzeroupper;
+}
+void SetContainsCallNeedingVzeroupper(bool value)
+{
+    containsCallNeedingVzeroupper = value;
+}
+
 bool IsDstDstSrcAVXInstruction(instruction ins) const;
 bool IsDstSrcSrcAVXInstruction(instruction ins) const;
 bool IsThreeOperandAVXInstruction(instruction ins) const;
69 changes: 69 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -2076,6 +2076,75 @@ void CallArgs::Remove(CallArg* arg)
assert(!"Did not find arg to remove in CallArgs::Remove");
}

#ifdef TARGET_XARCH
//---------------------------------------------------------------
// NeedsVzeroupper: Determines if the call needs a vzeroupper emitted before it is invoked
//
// Parameters:
// comp - the compiler
//
// Returns:
// true if a vzeroupper needs to be emitted; otherwise, false
//
bool GenTreeCall::NeedsVzeroupper(Compiler* comp)
{
bool needsVzeroupper = false;

if (IsPInvoke() && comp->canUseVexEncoding())
{
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
// register) and before any call to an unknown function.

switch (gtCallType)
{
case CT_USER_FUNC:
case CT_INDIRECT:
{
// Since P/Invokes are not compiled by the runtime, they are typically "unknown" since they
// may use the legacy encoding. This includes both CT_USER_FUNC and CT_INDIRECT

needsVzeroupper = true;
break;
}

case CT_HELPER:
{
// Most helpers are well known to not use any floating-point or SIMD logic internally, but
// a few do exist so we need to ensure they are handled. They are identified by taking or
// returning a floating-point or SIMD type, regardless of how it is actually passed/returned.

if (varTypeUsesFloatReg(this))
{
needsVzeroupper = true;
}
else
{
for (CallArg& arg : gtArgs.Args())
{
if (varTypeUsesFloatReg(arg.GetSignatureType()))
{
needsVzeroupper = true;
break;
}
}
}
break;
}

default:
{
unreached();
}
}
}

return needsVzeroupper;
}
#endif // TARGET_XARCH

//---------------------------------------------------------------
// GetOtherRegMask: Get the reg mask of gtOtherRegs of call node
//
4 changes: 4 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -5124,6 +5124,10 @@ struct GenTreeCall final : public GenTree
 #endif
     }
 
+#ifdef TARGET_XARCH
+    bool NeedsVzeroupper(Compiler* comp);
+#endif // TARGET_XARCH
+
     // Get reg mask of all the valid registers of gtOtherRegs array
     regMaskTP GetOtherRegMask() const;
 
10 changes: 10 additions & 0 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1341,6 +1341,16 @@ int LinearScan::BuildCall(GenTreeCall* call)
         srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
     }
 
+    if (call->NeedsVzeroupper(compiler))
+    {
+        // Much like for Contains256bitOrMoreAVX, we want to track if any
+        // call needs a vzeroupper inserted. This allows us to reduce
+        // the total number of vzeroupper being inserted for cases where
+        // no 256+ AVX is used directly by the method.
+
+        compiler->GetEmitter()->SetContainsCallNeedingVzeroupper(true);
+    }
+
     buildInternalRegisterUses();
 
     // Now generate defs and kills.
