diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h index 4e98ba2e10d233..c3494d0ebeefd7 100644 --- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h @@ -13,6 +13,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -133,6 +134,9 @@ class CheckerRegistry { DevelopmentStatus == "released") && "Invalid development status!"); } + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using CmdLineOptionList = llvm::SmallVector; @@ -189,6 +193,9 @@ class CheckerRegistry { // Used for lower_bound. explicit CheckerInfo(StringRef FullName) : FullName(FullName) {} + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using StateFromCmdLine = CheckerInfo::StateFromCmdLine; @@ -206,6 +213,9 @@ class CheckerRegistry { } explicit PackageInfo(StringRef FullName) : FullName(FullName) {} + + LLVM_DUMP_METHOD void dump() const { dumpToStream(llvm::errs()); } + LLVM_DUMP_METHOD void dumpToStream(llvm::raw_ostream &Out) const; }; using PackageInfoList = llvm::SmallVector; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9a340142a24281..ac9eb46dacb512 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -449,10 +449,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-export-dynamic"); if (!Args.hasArg(options::OPT_shared) && !IsStaticPIE) { - const std::string Loader = - D.DyldPrefix + ToolChain.getDynamicLinker(Args); CmdArgs.push_back("-dynamic-linker"); - CmdArgs.push_back(Args.MakeArgString(Loader)); + CmdArgs.push_back(Args.MakeArgString(Twine(D.DyldPrefix) + + ToolChain.getDynamicLinker(Args))); } } diff --git a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp index 62ac1ed252dd1e..f4d5db1e7a4b03 100644 --- a/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/CheckerRegistry.cpp @@ -27,6 +27,10 @@ using namespace clang; using namespace ento; using llvm::sys::DynamicLibrary; +//===----------------------------------------------------------------------===// +// Utilities. +//===----------------------------------------------------------------------===// + using RegisterCheckersFn = void (*)(CheckerRegistry &); static bool isCompatibleAPIVersion(const char *VersionString) { @@ -86,6 +90,63 @@ static bool isInPackage(const CheckerRegistry::CheckerInfo &Checker, return false; } +//===----------------------------------------------------------------------===// +// Methods of CmdLineOption, PackageInfo and CheckerInfo. +//===----------------------------------------------------------------------===// + +LLVM_DUMP_METHOD void +CheckerRegistry::CmdLineOption::dumpToStream(llvm::raw_ostream &Out) const { + // The description can be just checked in Checkers.inc, the point here is to + // debug whether we succeeded in parsing it. + Out << OptionName << " (" << OptionType << ", " + << (IsHidden ? 
"hidden, " : "") << DevelopmentStatus << ") default: \"" + << DefaultValStr; +} + +static StringRef toString(CheckerRegistry::StateFromCmdLine Kind) { + switch (Kind) { + case CheckerRegistry::StateFromCmdLine::State_Disabled: + return "Disabled"; + case CheckerRegistry::StateFromCmdLine::State_Enabled: + return "Enabled"; + case CheckerRegistry::StateFromCmdLine::State_Unspecified: + return "Unspecified"; + } +} + +LLVM_DUMP_METHOD void +CheckerRegistry::CheckerInfo::dumpToStream(llvm::raw_ostream &Out) const { + // The description can be just checked in Checkers.inc, the point here is to + // debug whether we succeeded in parsing it. Same with documentation uri. + Out << FullName << " (" << toString(State) << (IsHidden ? ", hidden" : "") + << ")\n"; + Out << " Options:\n"; + for (const CmdLineOption &Option : CmdLineOptions) { + Out << " "; + Option.dumpToStream(Out); + Out << '\n'; + } + Out << " Dependencies:\n"; + for (const CheckerInfo *Dependency : Dependencies) { + Out << " " << Dependency->FullName << '\n'; + } +} + +LLVM_DUMP_METHOD void +CheckerRegistry::PackageInfo::dumpToStream(llvm::raw_ostream &Out) const { + Out << FullName << "\n"; + Out << " Options:\n"; + for (const CmdLineOption &Option : CmdLineOptions) { + Out << " "; + Option.dumpToStream(Out); + Out << '\n'; + } +} + +//===----------------------------------------------------------------------===// +// Methods of CheckerRegistry. +//===----------------------------------------------------------------------===// + CheckerRegistry::CheckerInfoListRange CheckerRegistry::getMutableCheckersForCmdLineArg(StringRef CmdLineArg) { auto It = binaryFind(Checkers, CmdLineArg); diff --git a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp index 2b50944c6f2f6e..71929fdd9b37fe 100644 --- a/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/preinstalled_signal.cpp @@ -1,16 +1,16 @@ // RUN: %clangxx -std=c++11 %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_HANDLER %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HANDLER +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // RUN: %clangxx -std=c++11 -DTEST_INSTALL_SIG_ACTION %s -o %t -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=1 not %run %t 2>&1 | FileCheck %s -// RUN: env LD_PRELOAD=%shared_libasan %env_asan_opts=handle_segv=2 not %run %t 2>&1 | FileCheck %s +// RUN: 
%env_asan_opts=handle_segv=0 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-ACTION +// RUN: %env_asan_opts=handle_segv=1 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s +// RUN: %env_asan_opts=handle_segv=2 LD_PRELOAD=%shared_libasan not %run %t 2>&1 | FileCheck %s // REQUIRES: asan-dynamic-runtime diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1cbfd41dcbc324..86825ce8a446c9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36908,6 +36908,11 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, TLO.DAG, DL, ExtSizeInBits)); } + // Target unary shuffles by immediate: + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + case X86ISD::VPERMILPI: // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 16108bd1928f60..65caeab1d1cf27 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -49,11 +49,8 @@ class X86PartialReduction : public FunctionPass { } private: - bool tryMAddPattern(BinaryOperator *BO); - bool tryMAddReplacement(Value *Op, BinaryOperator *Add); - - bool trySADPattern(BinaryOperator *BO); - bool trySADReplacement(Value *Op, BinaryOperator *Add); + bool tryMAddReplacement(Instruction *Op); + bool trySADReplacement(Instruction *Op); }; } @@ -66,139 +63,24 @@ char X86PartialReduction::ID = 0; INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, "X86 Partial Reduction", false, false) -static bool isVectorReductionOp(const BinaryOperator &BO) { - if (!BO.getType()->isVectorTy()) +bool X86PartialReduction::tryMAddReplacement(Instruction *Op) { + if (!ST->hasSSE2()) return false; - unsigned Opcode = BO.getOpcode(); - - switch (Opcode) { - case Instruction::Add: - case Instruction::Mul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - break; - case Instruction::FAdd: - case Instruction::FMul: - if (auto *FPOp = dyn_cast(&BO)) - if (FPOp->getFastMathFlags().isFast()) - break; - LLVM_FALLTHROUGH; - default: + // Need at least 8 elements. + if (cast(Op->getType())->getNumElements() < 8) return false; - } - unsigned ElemNum = cast(BO.getType())->getNumElements(); - // Ensure the reduction size is a power of 2. - if (!isPowerOf2_32(ElemNum)) + // Element type should be i32. + if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - unsigned ElemNumToReduce = ElemNum; - - // Do DFS search on the def-use chain from the given instruction. We only - // allow four kinds of operations during the search until we reach the - // instruction that extracts the first element from the vector: - // - // 1. The reduction operation of the same opcode as the given instruction. - // - // 2. PHI node. - // - // 3. ShuffleVector instruction together with a reduction operation that - // does a partial reduction. - // - // 4. ExtractElement that extracts the first element from the vector, and we - // stop searching the def-use chain here. - // - // 3 & 4 above perform a reduction on all elements of the vector. We push defs - // from 1-3 to the stack to continue the DFS. The given instruction is not - // a reduction operation if we meet any other instructions other than those - // listed above. 
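For reference, a minimal LLVM IR sketch of the reduction shape the comment above describes, and that the new matchAddReduction() recognizes by walking backwards from the extractelement of lane 0; the function name, value names, and the <8 x i32> element count are invented for illustration:

define i32 @reduce_add_v8i32(<8 x i32> %v) {
  ; log2(8) = 3 shuffle+add stages; each stage folds the upper half of the
  ; remaining live lanes onto the lower half.
  %s1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = add <8 x i32> %v, %s1
  %s2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a2 = add <8 x i32> %a1, %s2
  %s3 = shufflevector <8 x i32> %a2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a3 = add <8 x i32> %a2, %s3
  ; matchAddReduction() starts at this extract and returns %v as the input to
  ; the reduction.
  %r = extractelement <8 x i32> %a3, i32 0
  ret i32 %r
}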
- - SmallVector UsersToVisit{&BO}; - SmallPtrSet Visited; - bool ReduxExtracted = false; - - while (!UsersToVisit.empty()) { - auto User = UsersToVisit.back(); - UsersToVisit.pop_back(); - if (!Visited.insert(User).second) - continue; - - for (const auto *U : User->users()) { - auto *Inst = dyn_cast(U); - if (!Inst) - return false; - - if (Inst->getOpcode() == Opcode || isa(U)) { - if (auto *FPOp = dyn_cast(Inst)) - if (!isa(FPOp) && !FPOp->getFastMathFlags().isFast()) - return false; - UsersToVisit.push_back(U); - } else if (auto *ShufInst = dyn_cast(U)) { - // Detect the following pattern: A ShuffleVector instruction together - // with a reduction that do partial reduction on the first and second - // ElemNumToReduce / 2 elements, and store the result in - // ElemNumToReduce / 2 elements in another vector. - - unsigned ResultElements = ShufInst->getType()->getNumElements(); - if (ResultElements < ElemNum) - return false; - - if (ElemNumToReduce == 1) - return false; - if (!isa(U->getOperand(1))) - return false; - for (unsigned i = 0; i < ElemNumToReduce / 2; ++i) - if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2)) - return false; - for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i) - if (ShufInst->getMaskValue(i) != -1) - return false; - - // There is only one user of this ShuffleVector instruction, which - // must be a reduction operation. - if (!U->hasOneUse()) - return false; - - auto *U2 = dyn_cast(*U->user_begin()); - if (!U2 || U2->getOpcode() != Opcode) - return false; - - // Check operands of the reduction operation. - if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) || - (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) { - UsersToVisit.push_back(U2); - ElemNumToReduce /= 2; - } else - return false; - } else if (isa(U)) { - // At this moment we should have reduced all elements in the vector. - if (ElemNumToReduce != 1) - return false; - - auto *Val = dyn_cast(U->getOperand(1)); - if (!Val || !Val->isZero()) - return false; - - ReduxExtracted = true; - } else - return false; - } - } - return ReduxExtracted; -} - -bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { - BasicBlock *BB = Add->getParent(); - - auto *BO = dyn_cast(Op); - if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() || - BO->getParent() != BB) + auto *Mul = dyn_cast(Op); + if (!Mul || Mul->getOpcode() != Instruction::Mul) return false; - Value *LHS = BO->getOperand(0); - Value *RHS = BO->getOperand(1); + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); // LHS and RHS should be only used once or if they are the same then only // used twice. Only check this when SSE4.1 is enabled and we have zext/sext @@ -219,7 +101,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { auto CanShrinkOp = [&](Value *Op) { auto IsFreeTruncation = [&](Value *Op) { if (auto *Cast = dyn_cast(Op)) { - if (Cast->getParent() == BB && + if (Cast->getParent() == Mul->getParent() && (Cast->getOpcode() == Instruction::SExt || Cast->getOpcode() == Instruction::ZExt) && Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16) @@ -232,16 +114,16 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { // If the operation can be freely truncated and has enough sign bits we // can shrink. 
if (IsFreeTruncation(Op) && - ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) return true; // SelectionDAG has limited support for truncating through an add or sub if // the inputs are freely truncatable. if (auto *BO = dyn_cast(Op)) { - if (BO->getParent() == BB && + if (BO->getParent() == Mul->getParent() && IsFreeTruncation(BO->getOperand(0)) && IsFreeTruncation(BO->getOperand(1)) && - ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16) + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) return true; } @@ -252,7 +134,7 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS)) return false; - IRBuilder<> Builder(Add); + IRBuilder<> Builder(Mul); auto *MulTy = cast(Op->getType()); unsigned NumElts = MulTy->getNumElements(); @@ -266,8 +148,11 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { EvenMask[i] = i * 2; OddMask[i] = i * 2 + 1; } - Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask); - Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask); + // Creating a new mul so the replaceAllUsesWith below doesn't replace the + // uses in the shuffles we're creating. + Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1)); + Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask); + Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask); Value *MAdd = Builder.CreateAdd(EvenElts, OddElts); // Concatenate zeroes to extend back to the original type. @@ -276,34 +161,21 @@ bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) { Value *Zero = Constant::getNullValue(MAdd->getType()); Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask); - // Replaces the use of mul in the original Add with the pmaddwd and zeroes. - Add->replaceUsesOfWith(BO, Concat); - Add->setHasNoSignedWrap(false); - Add->setHasNoUnsignedWrap(false); + Mul->replaceAllUsesWith(Concat); + Mul->eraseFromParent(); return true; } -// Try to replace operans of this add with pmaddwd patterns. -bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) { +bool X86PartialReduction::trySADReplacement(Instruction *Op) { if (!ST->hasSSE2()) return false; - // Need at least 8 elements. - if (cast(BO->getType())->getNumElements() < 8) - return false; - - // Element type should be i32. - if (!cast(BO->getType())->getElementType()->isIntegerTy(32)) + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. + if (!cast(Op->getType())->getElementType()->isIntegerTy(32)) return false; - bool Changed = false; - Changed |= tryMAddReplacement(BO->getOperand(0), BO); - Changed |= tryMAddReplacement(BO->getOperand(1), BO); - return Changed; -} - -bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { // Operand should be a select. auto *SI = dyn_cast(Op); if (!SI) @@ -337,7 +209,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { if (!Op0 || !Op1) return false; - IRBuilder<> Builder(Add); + IRBuilder<> Builder(SI); auto *OpTy = cast(Op->getType()); unsigned NumElts = OpTy->getNumElements(); @@ -355,7 +227,7 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID); + Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. 
@@ -419,27 +291,155 @@ bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) { Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); } - // Replaces the uses of Op in Add with the new sequence. - Add->replaceUsesOfWith(Op, Ops[0]); - Add->setHasNoSignedWrap(false); - Add->setHasNoUnsignedWrap(false); + SI->replaceAllUsesWith(Ops[0]); + SI->eraseFromParent(); return true; } -bool X86PartialReduction::trySADPattern(BinaryOperator *BO) { - if (!ST->hasSSE2()) - return false; +// Walk backwards from the ExtractElementInst and determine if it is the end of +// a horizontal reduction. Return the input to the reduction if we find one. +static Value *matchAddReduction(const ExtractElementInst &EE) { + // Make sure we're extracting index 0. + auto *Index = dyn_cast(EE.getIndexOperand()); + if (!Index || !Index->isNullValue()) + return nullptr; - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!cast(BO->getType())->getElementType()->isIntegerTy(32)) + const auto *BO = dyn_cast(EE.getVectorOperand()); + if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse()) + return nullptr; + + unsigned NumElems = cast(BO->getType())->getNumElements(); + // Ensure the reduction size is a power of 2. + if (!isPowerOf2_32(NumElems)) + return nullptr; + + const Value *Op = BO; + unsigned Stages = Log2_32(NumElems); + for (unsigned i = 0; i != Stages; ++i) { + const auto *BO = dyn_cast(Op); + if (!BO || BO->getOpcode() != Instruction::Add) + return nullptr; + + // If this isn't the first add, then it should only have 2 users, the + // shuffle and another add which we checked in the previous iteration. + if (i != 0 && !BO->hasNUses(2)) + return nullptr; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + auto *Shuffle = dyn_cast(LHS); + if (Shuffle) { + Op = RHS; + } else { + Shuffle = dyn_cast(RHS); + Op = LHS; + } + + // The first operand of the shuffle should be the same as the other operand + // of the bin op. + if (!Shuffle || Shuffle->getOperand(0) != Op) + return nullptr; + + // Verify the shuffle has the expected (at this stage of the pyramid) mask. + unsigned MaskEnd = 1 << i; + for (unsigned Index = 0; Index < MaskEnd; ++Index) + if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index)) + return nullptr; + } + + return const_cast(Op); +} + +// See if this BO is reachable from this Phi by walking forward through single +// use BinaryOperators with the same opcode. If we get back then we know we've +// found a loop and it is safe to step through this Add to find more leaves. +static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) { + // The PHI itself should only have one use. + if (!Phi->hasOneUse()) return false; - bool Changed = false; - Changed |= trySADReplacement(BO->getOperand(0), BO); - Changed |= trySADReplacement(BO->getOperand(1), BO); - return Changed; + Instruction *U = cast(*Phi->user_begin()); + if (U == BO) + return true; + + while (U->hasOneUse() && U->getOpcode() == BO->getOpcode()) + U = cast(*U->user_begin()); + + return U == BO; +} + +// Collect all the leaves of the tree of adds that feeds into the horizontal +// reduction. Root is the Value that is used by the horizontal reduction. +// We look through single use phis, single use adds, or adds that are used by +// a phi that forms a loop with the add. 
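To make the comment above concrete, here is an illustrative sketch (all names and the types are invented) of one loop-carried case: the accumulator phi and the vector add form a loop, collectLeaves() walks through them, and the sign-extended i16 multiply is the leaf handed to tryMAddReplacement():

define i32 @dot_product_sketch(<8 x i16>* %pa, <8 x i16>* %pb, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi <8 x i32> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %gep.a = getelementptr <8 x i16>, <8 x i16>* %pa, i64 %i
  %gep.b = getelementptr <8 x i16>, <8 x i16>* %pb, i64 %i
  %va = load <8 x i16>, <8 x i16>* %gep.a
  %vb = load <8 x i16>, <8 x i16>* %gep.b
  %sa = sext <8 x i16> %va to <8 x i32>
  %sb = sext <8 x i16> %vb to <8 x i32>
  ; Leaf: a multiply whose operands are free to truncate back to 16 bits.
  %mul = mul nsw <8 x i32> %sa, %sb
  ; This add is used once by the reduction below and once by the phi that
  ; loops back to it, which is the case isReachableFromPHI() accepts.
  %acc.next = add <8 x i32> %acc, %mul
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ; Same shuffle+add pyramid as in the earlier sketch, rooted at %acc.next.
  %s1 = shufflevector <8 x i32> %acc.next, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = add <8 x i32> %acc.next, %s1
  %s2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a2 = add <8 x i32> %a1, %s2
  %s3 = shufflevector <8 x i32> %a2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %a3 = add <8 x i32> %a2, %s3
  %r = extractelement <8 x i32> %a3, i32 0
  ret i32 %r
}

With this patch, collectLeaves() finds %mul starting from the extractelement in %exit, and tryMAddReplacement() rewrites it into the pmaddwd-friendly shape sketched at the end of this patch.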
+static void collectLeaves(Value *Root, SmallVectorImpl &Leaves) { + SmallPtrSet Visited; + SmallVector Worklist; + Worklist.push_back(Root); + + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; + + if (auto *PN = dyn_cast(V)) { + // PHI node should have single use unless it is the root node, then it + // has 2 uses. + if (!PN->hasNUses(PN == Root ? 2 : 1)) + break; + + // Push incoming values to the worklist. + for (Value *InV : PN->incoming_values()) + Worklist.push_back(InV); + + continue; + } + + if (auto *BO = dyn_cast(V)) { + if (BO->getOpcode() == Instruction::Add) { + // Simple case. Single use, just push its operands to the worklist. + if (BO->hasNUses(BO == Root ? 2 : 1)) { + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + continue; + } + + // If there is additional use, make sure it is an unvisited phi that + // gets us back to this node. + if (BO->hasNUses(BO == Root ? 3 : 2)) { + PHINode *PN = nullptr; + for (auto *U : Root->users()) + if (auto *P = dyn_cast(U)) + if (!Visited.count(P)) + PN = P; + + // If we didn't find a 2-input PHI then this isn't a case we can + // handle. + if (!PN || PN->getNumIncomingValues() != 2) + continue; + + // Walk forward from this phi to see if it reaches back to this add. + if (!isReachableFromPHI(PN, BO)) + continue; + + // The phi forms a loop with this Add, push its operands. + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + } + } + } + + // Not an add or phi, make it a leaf. + if (auto *I = dyn_cast(V)) { + if (!V->hasNUses(I == Root ? 2 : 1)) + continue; + + // Add this as a leaf. + Leaves.push_back(I); + } + } } bool X86PartialReduction::runOnFunction(Function &F) { @@ -458,22 +458,29 @@ bool X86PartialReduction::runOnFunction(Function &F) { bool MadeChange = false; for (auto &BB : F) { for (auto &I : BB) { - auto *BO = dyn_cast(&I); - if (!BO) + auto *EE = dyn_cast(&I); + if (!EE) continue; - if (!isVectorReductionOp(*BO)) + // First find a reduction tree. + // FIXME: Do we need to handle other opcodes than Add? + Value *Root = matchAddReduction(*EE); + if (!Root) continue; - if (BO->getOpcode() == Instruction::Add) { - if (tryMAddPattern(BO)) { + SmallVector Leaves; + collectLeaves(Root, Leaves); + + for (Instruction *I : Leaves) { + if (tryMAddReplacement(I)) { MadeChange = true; continue; } - if (trySADPattern(BO)) { + + // Don't do SAD matching on the root node. SelectionDAG already + // has support for that and currently generates better code. 
+ if (I != Root && trySADReplacement(I)) MadeChange = true; - continue; - } } } } diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index fee195ae121fd3..295b5271ed0b4e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7742,7 +7742,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7755,7 +7755,7 @@ define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7781,7 +7781,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { ; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7794,7 +7794,7 @@ define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { ; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7867,7 +7867,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7880,7 +7880,7 @@ define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7906,7 +7906,7 @@ define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -7919,7 +7919,7 @@ define i64 
@test_mm512_reduce_min_epu64(<8 x i64> %__W) { ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -7996,7 +7996,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8012,7 +8012,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8043,7 +8043,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8058,7 +8058,7 @@ define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8146,7 +8146,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8162,7 +8162,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper @@ -8194,7 +8194,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; 
X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx @@ -8210,7 +8210,7 @@ define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index d6d04d9b128412..6109bd25c69e2a 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2657,9 +2657,9 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; AVX-LABEL: madd_double_reduction: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -2720,9 +2720,9 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%r8), %xmm2 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 006dd3d5ff1788..f55a58048e227a 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -1061,9 +1061,9 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX-LABEL: sad_double_reduction: ; AVX: # %bb.0: # %bb ; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 09d1472c39e98f..e6f9bb597a225d 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -1969,8 +1969,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2052,8 +2051,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2079,8 +2077,7 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2238,8 +2235,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpmullw %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -2274,8 +2270,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2311,8 +2306,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2608,8 +2602,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpmullw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2647,8 +2640,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2687,8 +2679,7 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX512BWVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 50a250ba1adf04..6ffbe095c39baf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -145,8 +145,7 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) { ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; CHECK-NEXT: vmovapd %xmm0, %xmm0 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> )
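As a closing note on the X86PartialReduction changes earlier in this patch, here is an illustrative sketch (names invented; the operands are assumed to come from i16 sext/zext in the same block so the shrink check passes) of the IR that tryMAddReplacement() emits in place of an <8 x i32> multiply leaf. SelectionDAG later matches the even/odd-pair add to pmaddwd:

define <8 x i32> @pmaddwd_shape(<8 x i32> %sa, <8 x i32> %sb) {
  ; A fresh multiply is created so that replacing all uses of the original one
  ; does not also rewrite the uses inside these shuffles.
  %new.mul = mul <8 x i32> %sa, %sb
  ; De-interleave the products into even and odd lanes (EvenMask[i] = 2*i,
  ; OddMask[i] = 2*i + 1) and add the pairs: the horizontal pmaddwd shape.
  %even = shufflevector <8 x i32> %new.mul, <8 x i32> %new.mul, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd  = shufflevector <8 x i32> %new.mul, <8 x i32> %new.mul, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %madd = add <4 x i32> %even, %odd
  ; Concatenate zeroes to get back to the original <8 x i32> type; this value
  ; replaces all uses of the original multiply.
  %concat = shufflevector <4 x i32> %madd, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %concat
}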