diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 58145c7e3c5913d..3283cc8a229e5c3 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -112,6 +112,7 @@ class VectorCombine { bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool foldPermuteOfBinops(Instruction &I); bool foldShuffleOfBinops(Instruction &I); bool foldShuffleOfCastops(Instruction &I); bool foldShuffleOfShuffles(Instruction &I); @@ -1400,6 +1401,100 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { return true; } +/// Try to convert "shuffle (binop (shuffle, shuffle)), undef" +/// --> "binop (shuffle), (shuffle)". +bool VectorCombine::foldPermuteOfBinops(Instruction &I) { + BinaryOperator *BinOp; + ArrayRef OuterMask; + if (!match(&I, + m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask)))) + return false; + + // Don't introduce poison into div/rem. + if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem)) + return false; + + Value *Op00, *Op01; + ArrayRef Mask0; + if (!match(BinOp->getOperand(0), + m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))))) + return false; + + Value *Op10, *Op11; + ArrayRef Mask1; + if (!match(BinOp->getOperand(1), + m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))))) + return false; + + Instruction::BinaryOps Opcode = BinOp->getOpcode(); + auto *ShuffleDstTy = dyn_cast(I.getType()); + auto *BinOpTy = dyn_cast(BinOp->getType()); + auto *Op0Ty = dyn_cast(Op00->getType()); + auto *Op1Ty = dyn_cast(Op10->getType()); + if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty) + return false; + + unsigned NumSrcElts = BinOpTy->getNumElements(); + + // Don't accept shuffles that reference the second operand in + // div/rem or if it's an undef arg. 
+ if ((BinOp->isIntDivRem() || !isa(I.getOperand(1))) && + any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; })) + return false; + + // Merge outer / inner shuffles. + SmallVector NewMask0, NewMask1; + for (int M : OuterMask) { + if (M < 0 || M >= (int)NumSrcElts) { + NewMask0.push_back(PoisonMaskElem); + NewMask1.push_back(PoisonMaskElem); + } else { + NewMask0.push_back(Mask0[M]); + NewMask1.push_back(Mask1[M]); + } + } + + // Try to merge shuffles across the binop if the new shuffles are not costly. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost OldCost = + TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy, + OuterMask, CostKind, 0, nullptr, {BinOp}, &I) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0, + CostKind, 0, nullptr, {Op00, Op01}, + cast(BinOp->getOperand(0))) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1, + CostKind, 0, nullptr, {Op10, Op11}, + cast(BinOp->getOperand(1))); + + InstructionCost NewCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0, + CostKind, 0, nullptr, {Op00, Op01}) + + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, NewMask1, + CostKind, 0, nullptr, {Op10, Op11}) + + TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind); + + LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I + << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost + << "\n"); + if (NewCost >= OldCost) + return false; + + Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0); + Value *Shuf1 = Builder.CreateShuffleVector(Op10, Op11, NewMask1); + Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + + // Copy IR flags from the original binop. 
+ if (auto *NewInst = dyn_cast(NewBO)) + NewInst->copyIRFlags(BinOp); + + Worklist.pushValue(Shuf0); + Worklist.pushValue(Shuf1); + replaceValue(I, *NewBO); + return true; +} + /// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)". bool VectorCombine::foldShuffleOfBinops(Instruction &I) { BinaryOperator *B0, *B1; @@ -2736,6 +2831,7 @@ bool VectorCombine::run() { MadeChange |= foldInsExtFNeg(I); break; case Instruction::ShuffleVector: + MadeChange |= foldPermuteOfBinops(I); MadeChange |= foldShuffleOfBinops(I); MadeChange |= foldShuffleOfCastops(I); MadeChange |= foldShuffleOfShuffles(I); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll index 1d1c9d1f1d18c31..324503a30783d12 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math-inseltpoison.ll @@ -108,11 +108,10 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a, <8 x float> %b) #0 { define <8 x float> @reverse_hadd_v8f32(<8 x float> %a, <8 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[SHUFFLE]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll 
b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll index 4f8f04ec42497b8..9d3b69218313e89 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll @@ -108,11 +108,10 @@ define <8 x float> @hadd_reverse_v8f32(<8 x float> %a, <8 x float> %b) #0 { define <8 x float> @reverse_hadd_v8f32(<8 x float> %a, <8 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[SHUFFLE]] +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; %vecext = extractelement <8 x float> %a, i32 0 %vecext1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index 4a024cc4c0309c1..53d4b1ad96cb827 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -32,10 +32,9 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; AVX1-NEXT: ret <4 x double> [[SHUFFLE]] ; ; AVX2-LABEL: @PR50392( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> 
[[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] ; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] ; AVX2-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP5]], <4 x i32> diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll index 1d4cee45b668565..6ff68f50db1b7a8 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll @@ -16,12 +16,18 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> ; SSE-NEXT: ret <4 x double> [[TMP4]] ; -; AVX-LABEL: @PR94546( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX-NEXT: ret <4 x double> [[TMP4]] +; AVX1-LABEL: @PR94546( +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX1-NEXT: ret <4 x double> [[TMP4]] +; +; AVX2-LABEL: @PR94546( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> 
[[B]], <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: ret <4 x double> [[TMP3]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 @@ -43,5 +49,4 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { ret <4 x double> %shuffle } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX1: {{.*}} -; AVX2: {{.*}} +; AVX: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index 66fe11369d88bea..459ede173b841ae 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -937,10 +937,9 @@ define <4 x i64> @cast_mismatched_types(<4 x i32> %x) { define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: @fadd_mismatched_types( -; CHECK-NEXT: [[SHUF_X:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF_Y:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[FADD:%.*]] = fadd fast <2 x float> [[SHUF_X]], [[SHUF_Y]] -; CHECK-NEXT: [[EXTSHUF:%.*]] = shufflevector <2 x float> [[FADD]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[EXTSHUF]] ; %shuf.x = shufflevector <4 x float> %x, <4 x float> poison, <2 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll index e94868c7b9e5b38..8db1990dcbb5d8a 100644 --- 
a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll @@ -9,11 +9,10 @@ declare void @use_v4f64(<4 x double>) define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: define <4 x double> @fadd_v4f64( ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> @@ -25,11 +24,10 @@ define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) { define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: define <4 x double> @fadd_v4f64_poison_idx( ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; 
CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> @@ -41,11 +39,10 @@ define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) { define <4 x double> @fadd_mixed_types(<4 x double> %a, <2 x double> %b) { ; CHECK-LABEL: define <4 x double> @fadd_mixed_types( ; CHECK-SAME: <4 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[POST1:%.*]] = shufflevector <4 x double> [[POST]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: ret <4 x double> [[POST1]] +; CHECK-NEXT: ret <4 x double> [[POST]] ; %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> %b1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> @@ -95,11 +92,10 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> % define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: define <4 x i32> @sdiv_v4i32( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[POST:%.*]] = sdiv <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: 
[[POST1:%.*]] = shufflevector <4 x i32> [[POST]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[POST1]] +; CHECK-NEXT: ret <4 x i32> [[POST]] ; %a1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> %b1 = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32>