Skip to content

Commit

Permalink
[AArch64] Add invalid 1 x vscale costs for reductions and reduction-o…
Browse files Browse the repository at this point in the history
…perations.

The code-generator is currently not able to handle scalable vectors of <vscale
x 1 x eltty>. The usual "fix" for this until it is supported is to mark the
costs of loads/stores with an invalid cost, preventing the vectorizer from
vectorizing at those factors. But on rare occasions loops don't contain
load/stores, only reductions.

So whilst this is still unsupported return an invalid cost to avoid selecting
vscale x 1 VFs.  The cost of a reduction is not currently used by the
vectorizer so this adds the cost to the add/mul/and/or/xor or min/max that
should feed the reduction.  This change will be removed when code-generation
for these types is sufficiently reliable.

Fixes #99760
  • Loading branch information
davemgreen committed Aug 6, 2024
1 parent 6e4c580 commit 03874ee
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 0 deletions.
32 changes: 32 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,15 @@ static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
auto *RetTy = ICA.getReturnType();
if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
if (VTy->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

switch (ICA.getID()) {
case Intrinsic::experimental_vector_histogram_add:
if (!ST->hasSVE2())
Expand Down Expand Up @@ -3070,6 +3078,14 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
ArrayRef<const Value *> Args,
const Instruction *CxtI) {

// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
if (VTy->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Expand Down Expand Up @@ -3844,6 +3860,14 @@ InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
FastMathFlags FMF,
TTI::TargetCostKind CostKind) {
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
if (VTy->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
Expand Down Expand Up @@ -3888,6 +3912,14 @@ InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) {
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
// sufficiently reliable.
if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
if (VTy->getElementCount() == ElementCount::getScalable(1))
return InstructionCost::getInvalid();

if (TTI::requiresOrderedReduction(FMF)) {
if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
InstructionCost BaseCost =
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/arith-fp-sve.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ define void @fadd() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd <vscale x 4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd <vscale x 8 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fadd <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd <vscale x 4 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd <vscale x 8 x float> undef, undef
Expand All @@ -19,6 +20,7 @@ define void @fadd() {
%V8F16 = fadd <vscale x 8 x half> undef, undef
%V16F16 = fadd <vscale x 16 x half> undef, undef

%V1F32 = fadd <vscale x 1 x float> undef, undef
%V2F32 = fadd <vscale x 2 x float> undef, undef
%V4F32 = fadd <vscale x 4 x float> undef, undef
%V8F32 = fadd <vscale x 8 x float> undef, undef
Expand All @@ -34,6 +36,7 @@ define void @fsub() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub <vscale x 4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub <vscale x 8 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fsub <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub <vscale x 4 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub <vscale x 8 x float> undef, undef
Expand All @@ -45,6 +48,7 @@ define void @fsub() {
%V8F16 = fsub <vscale x 8 x half> undef, undef
%V16F16 = fsub <vscale x 16 x half> undef, undef

%V1F32 = fsub <vscale x 1 x float> undef, undef
%V2F32 = fsub <vscale x 2 x float> undef, undef
%V4F32 = fsub <vscale x 4 x float> undef, undef
%V8F32 = fsub <vscale x 8 x float> undef, undef
Expand Down
21 changes: 21 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sve-arith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,34 @@ define void @scalable_mul() #0 {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i64 = mul <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
%mul_nxv16i8 = mul <vscale x 16 x i8> undef, undef
%mul_nxv8i16 = mul <vscale x 8 x i16> undef, undef
%mul_nxv4i32 = mul <vscale x 4 x i32> undef, undef
%mul_nxv2i64 = mul <vscale x 2 x i64> undef, undef
%mul_nxv1i64 = mul <vscale x 1 x i64> undef, undef

ret void
}

define void @scalable_add() #0 {
; CHECK-LABEL: 'scalable_add'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv16i8 = add <vscale x 16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv8i16 = add <vscale x 8 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv4i32 = add <vscale x 4 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv2i64 = add <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i64 = add <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
%add_nxv16i8 = add <vscale x 16 x i8> undef, undef
%add_nxv8i16 = add <vscale x 8 x i16> undef, undef
%add_nxv4i32 = add <vscale x 4 x i32> undef, undef
%add_nxv2i64 = add <vscale x 2 x i64> undef, undef
%add_nxv1i64 = add <vscale x 1 x i64> undef, undef

ret void
}
Expand Down
21 changes: 21 additions & 0 deletions llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,20 @@ declare <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float>, i64

define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale x 4 x float> %v2, <vscale x 4 x double> %v3) {
; CHECK-LABEL: 'reductions'
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
Expand All @@ -134,25 +138,32 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'reductions'
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
Expand All @@ -161,24 +172,31 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %umax_nxv4i64 = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%add_nxv1i32 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
%add_nxv4i32 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v0)
%add_nxv4i64 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v1)
%mul_nxv1i32 = call i32 @llvm.vector.reduce.mul.nxv1i32(<vscale x 1 x i32> undef)
%mul_nxv4i32 = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v0)
%mul_nxv4i64 = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v1)
%and_nxv1i32 = call i32 @llvm.vector.reduce.and.nxv1i32(<vscale x 1 x i32> undef)
%and_nxv4i32 = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v0)
%and_nxv4i64 = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v1)
%or_nxv4i32 = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v0)
%or_nxv4i64 = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v1)
%xor_nxv4i32 = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v0)
%xor_nxv4i64 = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v1)
%umin_nxv1i64 = call i64 @llvm.vector.reduce.umin.nxv1i64(<vscale x 1 x i64> undef)
%umin_nxv4i32 = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v0)
%umin_nxv4i64 = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v1)
%smin_nxv4i32 = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v0)
Expand All @@ -188,10 +206,13 @@ define void @reductions(<vscale x 4 x i32> %v0, <vscale x 4 x i64> %v1, <vscale
%smax_nxv4i32 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v0)
%smax_nxv4i64 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v1)

%fadd_nxv1f32 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
%fadd_nxv4f32 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> %v2)
%fadd_nxv4f64 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> %v3)
%fmin_nxv1f32 = call fast float @llvm.vector.reduce.fmin.nxv1f32(<vscale x 1 x float> undef)
%fmin_nxv4f32 = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v2)
%fmin_nxv4f64 = call fast double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v3)
%fmax_nxv1f32 = call fast float @llvm.vector.reduce.fmax.nxv1f32(<vscale x 1 x float> undef)
%fmax_nxv4f32 = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v2)
%fmax_nxv4f64 = call fast double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v3)

Expand Down
Loading

0 comments on commit 03874ee

Please sign in to comment.