From 6ba3fb1b0896c1271dfe1902ef906abbb4707848 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 22 Aug 2024 07:30:39 -0700
Subject: [PATCH] [DAG][RISCV] Use vp_reduce_* when widening illegal types for
 reductions (#105455)

This allows the use of a single wider operation with a restricted EVL
instead of padding the vector with the neutral element.

For RISCV specifically, it's worth noting that an alternate padded
lowering is available when VL is one less than a power of two and
LMUL <= m1. We could slide the vector operand up by one and insert the
padding via a vslide1up. We don't currently pattern match this, but we
could. That form would arguably be better iff the surrounding code
wanted VL=4; this patch will force a VL toggle in that case instead.
Basically, it comes down to a question of whether we think odd-sized
vectors are going to appear clustered with other odd-sized vector
operations, or mixed in with larger power-of-two operations.

Note there is a potential downside of using vp nodes: we lose any
generic DAG combines which might have applied to the widened form.
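To make the effect concrete, here is a rough IR-level sketch (the patch
itself operates on VP_REDUCE_* SelectionDAG nodes during type
legalization, and the value names below are invented for illustration).
Given a reduction over an illegal 5-element type:

  %r = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %v)

the previous widened lowering padded the widened <8 x i32> with the
neutral element (zero for add) and reduced all eight lanes. The widened
lowering now behaves as if it were the equivalent VP reduction with an
all-true mask and EVL equal to the original element count, so the three
padding lanes never contribute:

  %r = call i32 @llvm.vp.reduce.add.v8i32(
           i32 0, <8 x i32> %v.wide,
           <8 x i1> <i1 true, i1 true, i1 true, i1 true,
                     i1 true, i1 true, i1 true, i1 true>,
           i32 5)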
---
 llvm/include/llvm/IR/VPIntrinsics.def         |  29 ++--
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  43 ++++-
 .../rvv/fixed-vectors-reduction-formation.ll  | 160 ++++++------------
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |  52 +++---
 .../RISCV/rvv/fixed-vectors-reduction-int.ll  |  34 ++--
 .../RISCV/rvv/vreductions-fp-sdnode.ll        |  42 ++---
 6 files changed, 158 insertions(+), 202 deletions(-)

diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index a4a1000d37259ed..9333f6be5b516d2 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -651,63 +651,64 @@ END_REGISTER_VP(vp_gather, VP_GATHER)
 #error \
     "The internal helper macro HELPER_REGISTER_REDUCTION_VP is already defined!"
 #endif
-#define HELPER_REGISTER_REDUCTION_VP(VPID, VPSD, INTRIN) \
+#define HELPER_REGISTER_REDUCTION_VP(VPID, VPSD, INTRIN, SDOPC) \
   BEGIN_REGISTER_VP(VPID, 2, 3, VPSD, 1) \
   VP_PROPERTY_FUNCTIONAL_INTRINSIC(INTRIN) \
+  VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) \
   VP_PROPERTY_REDUCTION(0, 1) \
   END_REGISTER_VP(VPID, VPSD)

 // llvm.vp.reduce.add(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_add, VP_REDUCE_ADD,
-                             vector_reduce_add)
+                             vector_reduce_add, VECREDUCE_ADD)

 // llvm.vp.reduce.mul(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_mul, VP_REDUCE_MUL,
-                             vector_reduce_mul)
+                             vector_reduce_mul, VECREDUCE_MUL)

 // llvm.vp.reduce.and(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_and, VP_REDUCE_AND,
-                             vector_reduce_and)
+                             vector_reduce_and, VECREDUCE_AND)

 // llvm.vp.reduce.or(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_or, VP_REDUCE_OR,
-                             vector_reduce_or)
+                             vector_reduce_or, VECREDUCE_OR)

 // llvm.vp.reduce.xor(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_xor, VP_REDUCE_XOR,
-                             vector_reduce_xor)
+                             vector_reduce_xor, VECREDUCE_XOR)

 // llvm.vp.reduce.smax(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_smax, VP_REDUCE_SMAX,
-                             vector_reduce_smax)
+                             vector_reduce_smax, VECREDUCE_SMAX)

 // llvm.vp.reduce.smin(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_smin, VP_REDUCE_SMIN,
-                             vector_reduce_smin)
+                             vector_reduce_smin, VECREDUCE_SMIN)

 // llvm.vp.reduce.umax(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_umax, VP_REDUCE_UMAX,
-                             vector_reduce_umax)
+                             vector_reduce_umax, VECREDUCE_UMAX)

 // llvm.vp.reduce.umin(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_umin, VP_REDUCE_UMIN,
-                             vector_reduce_umin)
+                             vector_reduce_umin, VECREDUCE_UMIN)

 // llvm.vp.reduce.fmax(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmax, VP_REDUCE_FMAX,
-                             vector_reduce_fmax)
+                             vector_reduce_fmax, VECREDUCE_FMAX)

 // llvm.vp.reduce.fmin(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmin, VP_REDUCE_FMIN,
-                             vector_reduce_fmin)
+                             vector_reduce_fmin, VECREDUCE_FMIN)

 // llvm.vp.reduce.fmaximum(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_fmaximum, VP_REDUCE_FMAXIMUM,
-                             vector_reduce_fmaximum)
+                             vector_reduce_fmaximum, VECREDUCE_FMAXIMUM)

 // llvm.vp.reduce.fminimum(start,x,mask,vlen)
 HELPER_REGISTER_REDUCTION_VP(vp_reduce_fminimum, VP_REDUCE_FMINIMUM,
-                             vector_reduce_fminimum)
+                             vector_reduce_fminimum, VECREDUCE_FMINIMUM)

 #undef HELPER_REGISTER_REDUCTION_VP

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 8315efcb6750f98..5745c147e3502d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7271,9 +7271,29 @@ SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) {
   return DAG.getBuildVector(VT, dl, Scalars);
 }

+static unsigned getExtendForIntVecReduction(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Expected integer vector reduction");
+  case ISD::VECREDUCE_ADD:
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_XOR:
+    return ISD::ANY_EXTEND;
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_SMIN:
+    return ISD::SIGN_EXTEND;
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_UMIN:
+    return ISD::ZERO_EXTEND;
+  }
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
   SDLoc dl(N);
   SDValue Op = GetWidenedVector(N->getOperand(0));
+  EVT VT = N->getValueType(0);
   EVT OrigVT =
N->getOperand(0).getValueType(); EVT WideVT = Op.getValueType(); EVT ElemVT = OrigVT.getVectorElementType(); @@ -7288,6 +7308,25 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { unsigned OrigElts = OrigVT.getVectorMinNumElements(); unsigned WideElts = WideVT.getVectorMinNumElements(); + // Generate a vp.reduce_op if it is custom/legal for the target. This avoids + // needing to pad the source vector, because the inactive lanes can simply be + // disabled and not contribute to the result. + // TODO: VECREDUCE_FADD, VECREDUCE_FMUL aren't currently mapped correctly, + // and thus don't take this path. + if (auto VPOpcode = ISD::getVPForBaseOpcode(Opc); + VPOpcode && TLI.isOperationLegalOrCustom(*VPOpcode, WideVT)) { + SDValue Start = NeutralElem; + if (VT.isInteger()) + Start = DAG.getNode(getExtendForIntVecReduction(Opc), dl, VT, Start); + assert(Start.getValueType() == VT); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WideVT.getVectorElementCount()); + SDValue Mask = DAG.getAllOnesConstant(dl, WideMaskVT); + SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(), + OrigVT.getVectorElementCount()); + return DAG.getNode(*VPOpcode, dl, VT, {Start, Op, Mask, EVL}, Flags); + } + if (WideVT.isScalableVector()) { unsigned GCD = std::gcd(OrigElts, WideElts); EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, @@ -7296,14 +7335,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD) Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags); + return DAG.getNode(Opc, dl, VT, Op, Flags); } for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, DAG.getVectorIdxConstant(Idx, dl)); - return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags); + return DAG.getNode(Opc, dl, VT, Op, Flags); } SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index c0bd49cc9c5cbfa..fa56412e71c6787 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -124,7 +124,7 @@ define i32 @reduce_sum_16xi32_prefix3(ptr %p) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -160,16 +160,10 @@ define i32 @reduce_sum_16xi32_prefix4(ptr %p) { define i32 @reduce_sum_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -189,16 +183,10 @@ define i32 @reduce_sum_16xi32_prefix5(ptr %p) { define 
i32 @reduce_sum_16xi32_prefix6(ptr %p) { ; CHECK-LABEL: reduce_sum_16xi32_prefix6: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 192 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -223,7 +211,7 @@ define i32 @reduce_sum_16xi32_prefix7(ptr %p) { ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -277,15 +265,8 @@ define i32 @reduce_sum_16xi32_prefix9(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: li a0, -512 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetivli zero, 9, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -315,15 +296,8 @@ define i32 @reduce_sum_16xi32_prefix13(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 14 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetivli zero, 13, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -362,15 +336,8 @@ define i32 @reduce_sum_16xi32_prefix14(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 12 -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -1 -; CHECK-NEXT: vmerge.vim v12, v12, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vsext.vf4 v16, v12 -; CHECK-NEXT: vand.vv v8, v8, v16 ; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vsetivli zero, 14, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -411,7 +378,7 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) { ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vslideup.vi v8, v12, 15 +; CHECK-NEXT: vsetivli zero, 15, e32, m4, ta, ma ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -532,16 +499,10 @@ define i32 @reduce_xor_16xi32_prefix2(ptr %p) { define i32 @reduce_xor_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_xor_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli 
zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 ; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma ; CHECK-NEXT: vredxor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -578,15 +539,10 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredand.vs v8, v8, v8 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredand.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -620,16 +576,11 @@ define i32 @reduce_or_16xi32_prefix2(ptr %p) { define i32 @reduce_or_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_or_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vredor.vs v8, v8, v8 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredor.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -672,13 +623,8 @@ define i32 @reduce_smax_16xi32_prefix5(ptr %p) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 524288 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredmax.vs v8, v8, v8 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmax.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -712,18 +658,13 @@ define i32 @reduce_smin_16xi32_prefix2(ptr %p) { define i32 @reduce_smin_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_smin_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 524288 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredmin.vs v8, v8, v8 +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmin.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -757,16 +698,11 @@ define 
i32 @reduce_umax_16xi32_prefix2(ptr %p) { define i32 @reduce_umax_16xi32_prefix5(ptr %p) { ; CHECK-LABEL: reduce_umax_16xi32_prefix5: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vmerge.vim v10, v10, 0, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vsext.vf4 v12, v10 -; CHECK-NEXT: vand.vv v8, v8, v12 -; CHECK-NEXT: vredmaxu.vs v8, v8, v8 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; CHECK-NEXT: vredmaxu.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 @@ -798,21 +734,27 @@ define i32 @reduce_umin_16xi32_prefix2(ptr %p) { } define i32 @reduce_umin_16xi32_prefix5(ptr %p) { -; CHECK-LABEL: reduce_umin_16xi32_prefix5: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, -1 -; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 5 -; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 7 -; CHECK-NEXT: vredminu.vs v8, v8, v8 -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: reduce_umin_16xi32_prefix5: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v10, -1 +; RV32-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; RV32-NEXT: vredminu.vs v8, v8, v10 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: reduce_umin_16xi32_prefix5: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; RV64-NEXT: vredminu.vs v8, v8, v10 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: ret %v = load <16 x i32>, ptr %p, align 256 %e0 = extractelement <16 x i32> %v, i32 0 %e1 = extractelement <16 x i32> %v, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index e9e147861df5641..26dc11aef2805b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1318,10 +1318,7 @@ define float @vreduce_fmin_v7f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -1568,10 +1565,7 @@ define float @vreduce_fmax_v7f32(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 1047552 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -1771,20 +1765,20 @@ define float @vreduce_fminimum_v7f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: 
vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 522240 -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v12, 7 -; CHECK-NEXT: vmfne.vv v9, v10, v10 -; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10, v0.t ; CHECK-NEXT: beqz a0, .LBB108_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB108_2: -; CHECK-NEXT: vfredmin.vs v8, v10, v8 +; CHECK-NEXT: lui a0, 522240 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -1799,10 +1793,7 @@ define float @vreduce_fminimum_v7f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 522240 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmin.vs v8, v12, v8 +; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -2527,20 +2518,20 @@ define float @vreduce_fmaximum_v7f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, 1046528 -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v12, 7 -; CHECK-NEXT: vmfne.vv v9, v10, v10 -; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmset.m v0 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10, v0.t ; CHECK-NEXT: beqz a0, .LBB136_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lui a0, 523264 ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB136_2: -; CHECK-NEXT: vfredmax.vs v8, v10, v8 +; CHECK-NEXT: lui a0, 1046528 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x @@ -2555,10 +2546,7 @@ define float @vreduce_fmaximum_v7f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lui a0, 1046528 ; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv.v.v v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 7 -; CHECK-NEXT: vfredmax.vs v8, v12, v8 +; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <7 x float>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 29d80979808a9cd..56944e2aa5074d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -40,8 +40,6 @@ define i8 @vreduce_add_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 ; CHECK-NEXT: vredsum.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -1768,10 +1766,9 @@ define i8 @vreduce_and_v3i8(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v 
v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredand.vs v8, v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vredand.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -2373,9 +2370,7 @@ define i8 @vreduce_or_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredor.vs v8, v8, v8 +; CHECK-NEXT: vredor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -2977,8 +2972,6 @@ define i8 @vreduce_xor_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 ; CHECK-NEXT: vredxor.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret @@ -3613,9 +3606,7 @@ define i8 @vreduce_smin_v3i8(ptr %x) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmin.vs v8, v8, v8 +; CHECK-NEXT: vredmin.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -4217,9 +4208,7 @@ define i8 @vreduce_smax_v3i8(ptr %x) { ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, -128 ; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmax.vs v8, v8, v8 +; CHECK-NEXT: vredmax.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -4819,10 +4808,9 @@ define i8 @vreduce_umin_v3i8(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, -1 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredminu.vs v8, v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vredminu.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x @@ -5423,9 +5411,7 @@ define i8 @vreduce_umax_v3i8(ptr %x) { ; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: vredmaxu.vs v8, v8, v8 +; CHECK-NEXT: vredmaxu.vs v8, v8, v9 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <3 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 30e31cecbf2c7b8..5b140299070b945 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -1018,22 +1018,17 @@ declare half @llvm.vector.reduce.fmin.nxv10f16() define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI73_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI73_0)(a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v12, fa5 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v12, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, 
ma -; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v11, v12, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: lui a1, %hi(.LCPI73_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0) +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: li a1, 10 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredmin.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmin.nxv10f16( %v) ret half %red @@ -1044,12 +1039,17 @@ declare half @llvm.vector.reduce.fmax.nxv12f16() define half @vreduce_fmax_nxv12f16( %v) { ; CHECK-LABEL: vreduce_fmax_nxv12f16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, -512 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.x v11, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: li a1, -512 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v12, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfredmax.vs v12, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v12 ; CHECK-NEXT: ret %red = call half @llvm.vector.reduce.fmax.nxv12f16( %v) ret half %red