Skip to content

Commit

Permalink
JIT: Added SVE GetFfr, SetFfr, LoadVectorFirstFaulting, GatherVectorFirstFaulting (#105595)
Browse files Browse the repository at this point in the history

* Initial work

* FirstFaulting partially works

* Added template

* Trying to test first-faulting behavior

* Using BoundedMemory to test FirstFaulting behavior for LoadVector.

* Fix size in validation

* Added more helper functions. Added conditional select tests for LoadVectorFirstFaulting.

* Added first-faulting behavior tests for GatherVectorFirstFaulting

* Added GetFfr suffix-style APIs

* Fixing GatherVector tests

* Formatting

* Feedback

* Feedback

* Ensure the P/Invokes are blittable

* Fix build

* Remove checking for zeroes after the fault

* Added GatherVectorFirstFaultingVectorBases test template, but currently without the FirstFaulting test. Added SveFfrTest template.

* Mark GetFfr methods as side-effectful

* Verifying expected fault result. Test tweaks.

* Fix build

* Add tracking of FFR register

somewhat workable

code cleanup

Remove FFR

Add all the GetFfr*

wip

Work with MskCns() model

Use physReg approach

Remove commented prototypes

working

Remove bunch of unnecessary code

Remove SpecialImport from GetFFR/SetFFR/LoadFirstFaulting

some more code cleanup

some fixup

* Change condition for PhysReg

* jit format

* Fix PoisonPage configuration while creating BoundedMemory

* Use mmap() instead of memalign() for memory allocation

* review feedback

* unspill for LoadVectorFirstFaulting as well

* Show error codes on failure

* Feedback

* Feedback

* Feedback

* Feedback

* Handle FFR correctly

* reuse some of the code

* Handle the special effect for SetFfr

* some fixes + test coverage

* do not zero init lvaFfrRegister

* reverted local change

* fix build break

* Fixing flags for OSX

* Fixup unix impl

* Trying to fix build

* Fix osx calls

* Fix unix impl - forgot to use 'value'

* Added default impl

* Updating error messages

* Attempt mprotect to determine which MAP_ANONYMOUS value to use

* Attempt mprotect to determine which MAP_ANONYMOUS value to use

* Add a way to query MAP_ANONYMOUS

* Add a way to query MAP_ANONYMOUS

* trying to figure out cmake reference

* trying to figure out cmake reference

* trying to figure out cmake reference

* forgot to add cpp files

* Use MemoryMappedFile and then use mprotect

* Minor cleanup

* Added XplatVirtualAlloc, hopefully it will work

* Fix build

* Fix build

* Update hwintrinsiccodegenarm64.cpp

* Trying to fix build

* Use SystemNative

* fix pinvoke

* Get rid of writeline

* Add mono check

* Fix misspelled word

* Check for wasm

* Fix build

* Use IsBrowser

* Check IsWasi

* Check IsBrowser a different way

* Check IsMonoRuntime

* Feedback. Check NETFRAMEWORK

* Simplify check

---------

Co-authored-by: Tanner Gooding <tagoo@outlook.com>
Co-authored-by: Kunal Pathak <Kunal.Pathak@microsoft.com>
Co-authored-by: Swapnil Gaikwad <swapnil.gaikwad@arm.com>
  • Loading branch information
4 people authored Aug 2, 2024
1 parent f120fff commit df09fd1
Show file tree
Hide file tree
Showing 25 changed files with 3,384 additions and 168 deletions.
4 changes: 4 additions & 0 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4245,6 +4245,10 @@ bool Compiler::fgVarIsNeverZeroInitializedInProlog(unsigned varNum)
bool result = varDsc->lvIsParam || lvaIsOSRLocal(varNum) || (varNum == lvaGSSecurityCookie) ||
(varNum == lvaInlinedPInvokeFrameVar) || (varNum == lvaStubArgumentVar) || (varNum == lvaRetAddrVar);

#ifdef TARGET_ARM64
result = result || (varNum == lvaFfrRegister);
#endif

#if FEATURE_FIXED_OUT_ARGS
result = result || (varNum == lvaOutgoingArgSpaceVar);
#endif
Expand Down
9 changes: 5 additions & 4 deletions src/coreclr/jit/fgdiagnostic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3428,14 +3428,15 @@ void Compiler::fgDebugCheckFlags(GenTree* tree, BasicBlock* block)

#if defined(TARGET_ARM64)
case NI_ArmBase_Yield:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
case NI_Sve_GatherPrefetch16Bit:
case NI_Sve_GatherPrefetch32Bit:
case NI_Sve_GatherPrefetch64Bit:
case NI_Sve_GatherPrefetch8Bit:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
case NI_Sve_SetFfr:
{
assert(tree->OperRequiresCallFlag(this));
expectedFlags |= GTF_GLOB_REF;
Expand Down
48 changes: 25 additions & 23 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26743,6 +26743,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
addr = Op(3);
break;

case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorFirstFaulting:
case NI_Sve_GatherVectorInt16SignExtend:
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorInt32SignExtend:
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorSByteSignExtend:
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
case NI_Sve_GatherVectorWithByteOffsets:
case NI_Sve_LoadVector:
case NI_Sve_LoadVectorNonTemporal:
Expand All @@ -26753,6 +26765,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
case NI_Sve_LoadVectorByteZeroExtendToUInt16:
case NI_Sve_LoadVectorByteZeroExtendToUInt32:
case NI_Sve_LoadVectorByteZeroExtendToUInt64:
case NI_Sve_LoadVectorFirstFaulting:
case NI_Sve_LoadVectorInt16SignExtendToInt32:
case NI_Sve_LoadVectorInt16SignExtendToInt64:
case NI_Sve_LoadVectorInt16SignExtendToUInt32:
Expand All @@ -26777,20 +26790,6 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
addr = Op(2);
break;

case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorInt16SignExtend:
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorInt32SignExtend:
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorSByteSignExtend:
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
addr = Op(2);
break;

#endif // TARGET_ARM64

default:
Expand Down Expand Up @@ -26870,11 +26869,12 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
{
#ifdef TARGET_ARM64
static_assert_no_msg(
AreContiguous(NI_Sve_GatherVector, NI_Sve_GatherVectorByteZeroExtend, NI_Sve_GatherVectorInt16SignExtend,
NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend, NI_Sve_GatherVectorInt32SignExtend,
NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend, NI_Sve_GatherVectorSByteSignExtend,
NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend, NI_Sve_GatherVectorUInt16ZeroExtend,
NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend, NI_Sve_GatherVectorUInt32ZeroExtend));
AreContiguous(NI_Sve_GatherVector, NI_Sve_GatherVectorByteZeroExtend, NI_Sve_GatherVectorFirstFaulting,
NI_Sve_GatherVectorInt16SignExtend, NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend,
NI_Sve_GatherVectorInt32SignExtend, NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend,
NI_Sve_GatherVectorSByteSignExtend, NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend,
NI_Sve_GatherVectorUInt16ZeroExtend, NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend,
NI_Sve_GatherVectorUInt32ZeroExtend));
assert(varTypeIsI(addr) || (varTypeIsSIMD(addr) && ((intrinsicId >= NI_Sve_GatherVector) &&
(intrinsicId <= NI_Sve_GatherVectorUInt32ZeroExtend))));
#else
Expand Down Expand Up @@ -27292,6 +27292,7 @@ bool GenTreeHWIntrinsic::OperRequiresCallFlag() const
case NI_Sve_GatherPrefetch32Bit:
case NI_Sve_GatherPrefetch64Bit:
case NI_Sve_GatherPrefetch8Bit:
case NI_Sve_SetFfr:
{
return true;
}
Expand Down Expand Up @@ -27474,14 +27475,15 @@ void GenTreeHWIntrinsic::Initialize(NamedIntrinsic intrinsicId)

#if defined(TARGET_ARM64)
case NI_ArmBase_Yield:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
case NI_Sve_GatherPrefetch16Bit:
case NI_Sve_GatherPrefetch32Bit:
case NI_Sve_GatherPrefetch64Bit:
case NI_Sve_GatherPrefetch8Bit:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
case NI_Sve_SetFfr:
{
// Mark as a call and global reference, much as is done for GT_KEEPALIVE
gtFlags |= (GTF_CALL | GTF_GLOB_REF);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2165,6 +2165,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
#elif defined(TARGET_ARM64)
case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorFirstFaulting:
case NI_Sve_GatherVectorInt16SignExtend:
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
case NI_Sve_GatherVectorInt32SignExtend:
Expand Down
53 changes: 39 additions & 14 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2049,6 +2049,33 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_GatherVectorFirstFaulting:
{
if (node->GetAuxiliaryType() == TYP_UNKNOWN)
{
if (intrin.numOperands == 3)
{
// We have extra argument which means there is a "use" of FFR here. Restore it back in FFR
// register.
assert(op3Reg != REG_NA);
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op3Reg, opt);
}
}
else
{
// AuxiliaryType is added only for numOperands == 3. If there is an extra argument, we need to
// "use" FFR here. Restore it back in FFR register.

if (intrin.numOperands == 4)
{
// We have extra argument which means there is a "use" of FFR here. Restore it back in FFR
// register.
assert(op4Reg != REG_NA);
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op4Reg, opt);
}
}
FALLTHROUGH;
}
case NI_Sve_GatherVector:
case NI_Sve_GatherVectorByteZeroExtend:
case NI_Sve_GatherVectorInt16SignExtend:
Expand All @@ -2065,25 +2092,24 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
// GatherVector...(Vector<T> mask, T* address, Vector<T2> indices)

assert(intrin.numOperands == 3);
emitAttr baseSize = emitActualTypeSize(intrin.baseType);
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;
emitAttr baseSize = emitActualTypeSize(intrin.baseType);
bool isLoadingBytes = ((ins == INS_sve_ld1b) || (ins == INS_sve_ld1sb) || (ins == INS_sve_ldff1b) ||
(ins == INS_sve_ldff1sb));
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;

if (baseSize == EA_8BYTE)
{
// Index is multiplied.
sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
: INS_SCALABLE_OPTS_LSL_N;
}
else
if (baseSize == EA_4BYTE)
{
// Index is sign or zero extended to 64bits, then multiplied.
assert(baseSize == EA_4BYTE);
opt = varTypeIsUnsigned(node->GetAuxiliaryType()) ? INS_OPTS_SCALABLE_S_UXTW
: INS_OPTS_SCALABLE_S_SXTW;

sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
: INS_SCALABLE_OPTS_MOD_N;
sopt = isLoadingBytes ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_MOD_N;
}
else
{
// Index is multiplied.
assert(baseSize == EA_8BYTE);
sopt = isLoadingBytes ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N;
}

GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt, sopt);
Expand All @@ -2092,7 +2118,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
// GatherVector...(Vector<T> mask, Vector<T2> addresses)

assert(intrin.numOperands == 2);
GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt);
}

Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ HARDWARE_INTRINSIC(Sve, GatherPrefetch64Bit,
HARDWARE_INTRINSIC(Sve, GatherPrefetch8Bit, -1, -1, {INS_sve_prfb, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand|HW_Flag_SpecialSideEffect_Other)
HARDWARE_INTRINSIC(Sve, GatherVector, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorByteZeroExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1w, INS_sve_ldff1w, INS_sve_ldff1d, INS_sve_ldff1d, INS_sve_ldff1w, INS_sve_ldff1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialSideEffectMask)
HARDWARE_INTRINSIC(Sve, GatherVectorInt16SignExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorInt16WithByteOffsetsSignExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorInt32SignExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sw, INS_sve_ld1sw, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
Expand Down
51 changes: 49 additions & 2 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1775,6 +1775,50 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)

break;
}
case NI_Sve_GatherVectorFirstFaulting:
{
LIR::Use use;
bool foundUse = BlockRange().TryGetUse(node, &use);

if (m_ffrTrashed)
{
// Consume the FFR register value from local variable to simulate "use" of FFR,
// only if it was trashed. If it was not trashed, we do not have to reload the
// contents of the FFR register.

unsigned lclNum = comp->getFFRegisterVarNum();
GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK);
BlockRange().InsertBefore(node, lclVar);
LowerNode(lclVar);

if (node->GetOperandCount() == 3)
{
assert(node->GetAuxiliaryType() != TYP_UNKNOWN);
node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), node->Op(3), lclVar);
}
else
{
assert(node->GetOperandCount() == 2);
node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar);
}
}

if (foundUse)
{
unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR"));
LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum);
tmpVarDsc->lvType = node->TypeGet();
GenTree* storeLclVar;
use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar);
}
else
{
node->SetUnusedValue();
}

StoreFFRValue(node);
break;
}
case NI_Sve_LoadVectorFirstFaulting:
{
LIR::Use use;
Expand All @@ -1786,7 +1830,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
// only if it was trashed. If it was not trashed, we do not have to reload the
// contents of the FFR register.

GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK);
unsigned lclNum = comp->getFFRegisterVarNum();
GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK);
BlockRange().InsertBefore(node, lclVar);
LowerNode(lclVar);

Expand Down Expand Up @@ -4083,8 +4128,10 @@ void Lowering::StoreFFRValue(GenTreeHWIntrinsic* node)
#ifdef DEBUG
switch (node->GetHWIntrinsicId())
{
case NI_Sve_SetFfr:
case NI_Sve_GatherVectorFirstFaulting:
case NI_Sve_LoadVectorFirstFaulting:
case NI_Sve_SetFfr:

break;
default:
assert(!"Unexpected HWIntrinsicId");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ namespace System.Buffers
/// </summary>
public static partial class BoundedMemory
{
public static bool UnixBoundsEnabled { get; set; }
private static readonly int SystemPageSize = Environment.SystemPageSize;

/// <summary>
/// Allocates a new <see cref="BoundedMemory{T}"/> region which is immediately preceded by
/// or immediately followed by a poison (MEM_NOACCESS) page. If <paramref name="placement"/>
Expand Down Expand Up @@ -82,10 +85,15 @@ private static BoundedMemory<T> AllocateWithoutDataPopulation<T>(int elementCoun
{
return AllocateWithoutDataPopulationWindows<T>(elementCount, placement);
}
else
#if NETFRAMEWORK
return AllocateWithoutDataPopulationDefault<T>(elementCount, placement);
#else
else if (OperatingSystem.IsBrowser() || OperatingSystem.IsWasi())
{
return AllocateWithoutDataPopulationUnix<T>(elementCount, placement);
return AllocateWithoutDataPopulationDefault<T>(elementCount, placement);
}
return AllocateWithoutDataPopulationUnix<T>(elementCount, placement);
#endif
}
}
}
Loading

0 comments on commit df09fd1

Please sign in to comment.