diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp index 93ccd5492af764..5fc973223ea3b1 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp @@ -75,7 +75,10 @@ void UnconventionalAssignOperatorCheck::check( } else { static const char *const Messages[][2] = { {"ReturnType", "operator=() should return '%0&'"}, - {"ArgumentType", "operator=() should take '%0 const&', '%0&&' or '%0'"}, + {"ArgumentType", + getLangOpts().CPlusPlus11 + ? "operator=() should take '%0 const&', '%0&&' or '%0'" + : "operator=() should take '%0 const&' or '%0'"}, {"cv", "operator=() should not be marked '%1'"}}; const auto *Method = Result.Nodes.getNodeAs<CXXMethodDecl>("method"); diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index c1145802aaa418..21d304e1a43830 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -272,6 +272,11 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *Decl = Result.Nodes.getNodeAs<NamedDecl>("decl")) { + // Fix using namespace declarations. + if (const auto *UsingNS = dyn_cast<UsingDirectiveDecl>(Decl)) + addUsage(NamingCheckFailures, UsingNS->getNominatedNamespaceAsWritten(), + UsingNS->getIdentLocation()); + if (!Decl->getIdentifier() || Decl->getName().empty() || Decl->isImplicit()) return; diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index bd92b8c1bdb43c..fdee71fd22449a 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -162,7 +162,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI, SerializedDeclsCollector.takeMacros(), std::move(StatCache), SerializedDeclsCollector.takeCanonicalIncludes()); } else { - elog("Could not build a preamble for file {0} version {2}", FileName, + elog("Could not build a preamble for file {0} version {1}", FileName, Inputs.Version); return nullptr; } diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp new file mode 100644 index 00000000000000..7dc939955f37a4 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp @@ -0,0 +1,6 @@ +// RUN: %check_clang_tidy -std=c++98,c++03 %s misc-unconventional-assign-operator %t + +struct BadArgument { + BadArgument &operator=(BadArgument &); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: operator=() should take 'BadArgument const&' or 'BadArgument' +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp index c9509434813cdf..9082813910642d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp @@ -527,3 +527,8 @@ void MyPoiterFunction(int * p_normal_pointer, int * const constant_ptr){ // CHECK-FIXES: {{^}} int * const lc_PointerB = nullptr;{{$}} } +using namespace FOO_NS; +// CHECK-FIXES: {{^}}using namespace foo_ns; + +using namespace FOO_NS::InlineNamespace; +// CHECK-FIXES: {{^}}using namespace
foo_ns::inline_namespace; diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index b338f1b8a08f64..7733e55ab229ca 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -195,6 +195,7 @@ set(LLVM_TOOLCHAIN_TOOLS llvm-cxxfilt llvm-dwarfdump llvm-dwp + llvm-gsymutil llvm-lib llvm-nm llvm-objcopy diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 66ce6eda9c8c80..ecdf7387183349 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -131,7 +131,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | loop extension | clause: if for SIMD directives | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop extension | inclusive scan extension (matching C++17 PSTL) | :none:`unclaimed` | | +| loop extension | inclusive scan extension (matching C++17 PSTL) | :none:`claimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory mangagement | memory allocators | :good:`done` | r341687,r357929 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -179,6 +179,10 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: device_type | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | clause: extended device | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | clause: uses_allocators clause | :none:`claimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: in_reduction | :part:`worked on` | r308768 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | omp_get_device_num() | :part:`worked on` | D54342 | diff --git a/clang/include/clang/AST/ASTDumperUtils.h b/clang/include/clang/AST/ASTDumperUtils.h index 55a085449a9b2e..1dce913049ad6e 100644 --- a/clang/include/clang/AST/ASTDumperUtils.h +++ b/clang/include/clang/AST/ASTDumperUtils.h @@ -62,6 +62,8 @@ static const TerminalColor LocationColor = {llvm::raw_ostream::YELLOW, false}; static const TerminalColor ValueKindColor = {llvm::raw_ostream::CYAN, false}; // bitfield/objcproperty/objcsubscript/vectorcomponent static const 
TerminalColor ObjectKindColor = {llvm::raw_ostream::CYAN, false}; +// contains-errors +static const TerminalColor ErrorsColor = {llvm::raw_ostream::RED, true}; // Null statements static const TerminalColor NullColor = {llvm::raw_ostream::BLUE, false}; diff --git a/clang/include/clang/AST/CXXInheritance.h b/clang/include/clang/AST/CXXInheritance.h index f223c1f2f4f0af..8b1bcb367b3b47 100644 --- a/clang/include/clang/AST/CXXInheritance.h +++ b/clang/include/clang/AST/CXXInheritance.h @@ -119,7 +119,7 @@ class CXXBasePaths { friend class CXXRecordDecl; /// The type from which this search originated. - CXXRecordDecl *Origin = nullptr; + const CXXRecordDecl *Origin = nullptr; /// Paths - The actual set of paths that can be taken from the /// derived class to the same base class. @@ -225,8 +225,8 @@ class CXXBasePaths { /// Retrieve the type from which this base-paths search /// began - CXXRecordDecl *getOrigin() const { return Origin; } - void setOrigin(CXXRecordDecl *Rec) { Origin = Rec; } + const CXXRecordDecl *getOrigin() const { return Origin; } + void setOrigin(const CXXRecordDecl *Rec) { Origin = Rec; } /// Clear the base-paths results. void clear(); diff --git a/clang/include/clang/AST/ComputeDependence.h b/clang/include/clang/AST/ComputeDependence.h index 593ff3a6eb163a..69ccb6c676e540 100644 --- a/clang/include/clang/AST/ComputeDependence.h +++ b/clang/include/clang/AST/ComputeDependence.h @@ -59,6 +59,7 @@ class CXXDeleteExpr; class ArrayTypeTraitExpr; class ExpressionTraitExpr; class CXXNoexceptExpr; +class PackExpansionExpr; class SubstNonTypeTemplateParmExpr; class CoroutineSuspendExpr; class DependentCoawaitExpr; @@ -71,6 +72,7 @@ class LambdaExpr; class CXXUnresolvedConstructExpr; class CXXDependentScopeMemberExpr; class MaterializeTemporaryExpr; +class CXXFoldExpr; class TypeTraitExpr; class ConceptSpecializationExpr; class PredefinedExpr; @@ -134,6 +136,7 @@ ExprDependence computeDependence(CXXDeleteExpr *E); ExprDependence computeDependence(ArrayTypeTraitExpr *E); ExprDependence computeDependence(ExpressionTraitExpr *E); ExprDependence computeDependence(CXXNoexceptExpr *E, CanThrowResult CT); +ExprDependence computeDependence(PackExpansionExpr *E); ExprDependence computeDependence(SubstNonTypeTemplateParmExpr *E); ExprDependence computeDependence(CoroutineSuspendExpr *E); ExprDependence computeDependence(DependentCoawaitExpr *E); @@ -149,6 +152,7 @@ ExprDependence computeDependence(LambdaExpr *E, ExprDependence computeDependence(CXXUnresolvedConstructExpr *E); ExprDependence computeDependence(CXXDependentScopeMemberExpr *E); ExprDependence computeDependence(MaterializeTemporaryExpr *E); +ExprDependence computeDependence(CXXFoldExpr *E); ExprDependence computeDependence(TypeTraitExpr *E); ExprDependence computeDependence(ConceptSpecializationExpr *E, bool ValueDependent); diff --git a/clang/include/clang/AST/DependenceFlags.h b/clang/include/clang/AST/DependenceFlags.h index 21daf0a203ac88..ee6439fc984cf9 100644 --- a/clang/include/clang/AST/DependenceFlags.h +++ b/clang/include/clang/AST/DependenceFlags.h @@ -20,19 +20,23 @@ struct ExprDependenceScope { Type = 4, Value = 8, + // clang extension: this expr contains or references an error, and is + // considered dependent on how that error is resolved. 
+ Error = 16, + None = 0, - All = 15, + All = 31, TypeValue = Type | Value, TypeInstantiation = Type | Instantiation, ValueInstantiation = Value | Instantiation, TypeValueInstantiation = Type | Value | Instantiation, - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Value) + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Error) }; }; using ExprDependence = ExprDependenceScope::ExprDependence; -static constexpr unsigned ExprDependenceBits = 4; +static constexpr unsigned ExprDependenceBits = 5; struct TypeDependenceScope { enum TypeDependence : uint8_t { @@ -47,6 +51,8 @@ struct TypeDependenceScope { /// Whether this type is a variably-modified type (C99 6.7.5). VariablyModified = 8, + // FIXME: add Error bit. + None = 0, All = 15, @@ -83,11 +89,14 @@ LLVM_COMMON_DEPENDENCE(TemplateArgumentDependence) /// Computes dependencies of a reference with the name having template arguments /// with \p TA dependencies. inline ExprDependence toExprDependence(TemplateArgumentDependence TA) { - auto E = - static_cast<ExprDependence>(TA & ~TemplateArgumentDependence::Dependent); + auto D = ExprDependence::None; + if (TA & TemplateArgumentDependence::UnexpandedPack) + D |= ExprDependence::UnexpandedPack; + if (TA & TemplateArgumentDependence::Instantiation) + D |= ExprDependence::Instantiation; if (TA & TemplateArgumentDependence::Dependent) - return E | ExprDependence::Type | ExprDependence::Value; - return E; + D |= ExprDependence::Type | ExprDependence::Value; + return D; } inline ExprDependence toExprDependence(TypeDependence TD) { // This hack works because TypeDependence and TemplateArgumentDependence @@ -127,10 +136,13 @@ toTemplateArgumentDependence(TemplateNameDependence D) { } inline TemplateArgumentDependence toTemplateArgumentDependence(ExprDependence ED) { - TemplateArgumentDependence TAD = static_cast<TemplateArgumentDependence>( ED & ~(ExprDependence::Type | ExprDependence::Value)); + TemplateArgumentDependence TAD = TemplateArgumentDependence::None; if (ED & (ExprDependence::Type | ExprDependence::Value)) TAD |= TemplateArgumentDependence::Dependent; + if (ED & ExprDependence::Instantiation) + TAD |= TemplateArgumentDependence::Instantiation; + if (ED & ExprDependence::UnexpandedPack) + TAD |= TemplateArgumentDependence::UnexpandedPack; return TAD; } diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 7448281c928905..0a136a7376581d 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -226,6 +226,12 @@ class Expr : public ValueStmt { return static_cast<bool>(getDependence() & ExprDependence::UnexpandedPack); } + /// Whether this expression contains subexpressions which had errors, e.g. a + /// TypoExpr. + bool containsErrors() const { + return static_cast<bool>(getDependence() & ExprDependence::Error); + } + /// getExprLoc - Return the preferred location for the arrow when diagnosing /// a problem with a generic expression.
SourceLocation getExprLoc() const LLVM_READONLY; @@ -5881,7 +5887,8 @@ class TypoExpr : public Expr { public: TypoExpr(QualType T) : Expr(TypoExprClass, T, VK_LValue, OK_Ordinary) { assert(T->isDependentType() && "TypoExpr given a non-dependent type"); - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(ExprDependence::TypeValueInstantiation | + ExprDependence::Error); } child_range children() { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 33ea3f6346b227..e3404fec02dd5f 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4020,7 +4020,7 @@ class PackExpansionExpr : public Expr { EllipsisLoc(EllipsisLoc), NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Pattern(Pattern) { - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(computeDependence(this)); } PackExpansionExpr(EmptyShell Empty) : Expr(PackExpansionExprClass, Empty) {} @@ -4531,7 +4531,7 @@ class CXXFoldExpr : public Expr { NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) { SubExprs[0] = LHS; SubExprs[1] = RHS; - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(computeDependence(this)); } CXXFoldExpr(EmptyShell Empty) : Expr(CXXFoldExprClass, Empty) {} diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 79f43fc8ab88c3..e82a5f09a32d18 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -4433,6 +4433,12 @@ class OMPDeviceClause : public OMPClause, public OMPClauseWithPreInit { /// Location of '('. SourceLocation LParenLoc; + /// Device clause modifier. + OpenMPDeviceClauseModifier Modifier = OMPC_DEVICE_unknown; + + /// Location of the modifier. + SourceLocation ModifierLoc; + /// Device number. Stmt *Device = nullptr; @@ -4441,20 +4447,30 @@ class OMPDeviceClause : public OMPClause, public OMPClauseWithPreInit { /// \param E Device number. void setDevice(Expr *E) { Device = E; } + /// Sets modifier. + void setModifier(OpenMPDeviceClauseModifier M) { Modifier = M; } + + /// Sets modifier location. + void setModifierLoc(SourceLocation Loc) { ModifierLoc = Loc; } + public: /// Build 'device' clause. /// + /// \param Modifier Clause modifier. /// \param E Expression associated with this clause. /// \param CaptureRegion Innermost OpenMP region where expressions in this /// clause must be captured. /// \param StartLoc Starting location of the clause. + /// \param ModifierLoc Modifier location. /// \param LParenLoc Location of '('. /// \param EndLoc Ending location of the clause. - OMPDeviceClause(Expr *E, Stmt *HelperE, OpenMPDirectiveKind CaptureRegion, - SourceLocation StartLoc, SourceLocation LParenLoc, + OMPDeviceClause(OpenMPDeviceClauseModifier Modifier, Expr *E, Stmt *HelperE, + OpenMPDirectiveKind CaptureRegion, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) : OMPClause(OMPC_device, StartLoc, EndLoc), OMPClauseWithPreInit(this), - LParenLoc(LParenLoc), Device(E) { + LParenLoc(LParenLoc), Modifier(Modifier), ModifierLoc(ModifierLoc), + Device(E) { setPreInitStmt(HelperE, CaptureRegion); } @@ -4475,6 +4491,12 @@ class OMPDeviceClause : public OMPClause, public OMPClauseWithPreInit { /// Return device number. Expr *getDevice() const { return cast<Expr>(Device); } + /// Gets modifier. + OpenMPDeviceClauseModifier getModifier() const { return Modifier; } + + /// Gets modifier location.
+ SourceLocation getModifierLoc() const { return ModifierLoc; } + child_range children() { return child_range(&Device, &Device + 1); } const_child_range children() const { diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index e72b44fbb31ffa..05c6167dcbb20e 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1496,7 +1496,7 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { return CachedLocalOrUnnamed; } }; - enum { NumTypeBits = 18 }; + enum { NumTypeBits = 8 + TypeDependenceBits + 6 }; protected: // These classes allow subclasses to somewhat cleanly pack bitfields diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index a0d521d17d0f80..10ed63d7ccae96 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -359,6 +359,8 @@ class TargetArch<list<string> arches> : TargetSpec { let Arches = arches; } def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>; +def TargetAArch64 : TargetArch<["aarch64"]>; +def TargetAnyArm : TargetArch<!listconcat(TargetARM.Arches, TargetAArch64.Arches)>; def TargetAVR : TargetArch<["avr"]>; def TargetBPF : TargetArch<["bpfel", "bpfeb"]>; def TargetMips32 : TargetArch<["mips", "mipsel"]>; @@ -623,7 +625,7 @@ def Alias : Attr { let Documentation = [Undocumented]; } -def ArmBuiltinAlias : InheritableAttr, TargetSpecificAttr<TargetARM> { +def ArmBuiltinAlias : InheritableAttr, TargetSpecificAttr<TargetAnyArm> { let Spellings = [Clang<"__clang_arm_builtin_alias">]; let Args = [IdentifierArgument<"BuiltinName">]; let Subjects = SubjectList<[Function], ErrorDiag>; diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 3c8b0eeb47a5c0..e047054447f333 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -63,7 +63,6 @@ CODEGENOPT(ExperimentalNewPassManager, 1, 0) ///< Enables the new, experimental CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new ///< pass manager. CODEGENOPT(DisableRedZone , 1, 0) ///< Set when -mno-red-zone is enabled. -CODEGENOPT(EnableDebugEntryValues, 1, 0) ///< Emit call site parameter dbg info CODEGENOPT(EmitCallSiteInfo, 1, 0) ///< Emit call site info only in the case of ///< '-g' + 'O>0' level. CODEGENOPT(IndirectTlsSegRefs, 1, 0) ///< Set when -mno-tls-direct-seg-refs diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index e6a4aa1d1f582e..0488dad6706b57 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -212,6 +212,9 @@ #ifndef OPENMP_DEPOBJ_CLAUSE #define OPENMP_DEPOBJ_CLAUSE(Name) #endif +#ifndef OPENMP_DEVICE_MODIFIER +#define OPENMP_DEVICE_MODIFIER(Name) +#endif // OpenMP clauses. OPENMP_CLAUSE(allocator, OMPAllocatorClause) @@ -366,6 +369,10 @@ OPENMP_SCHEDULE_MODIFIER(monotonic) OPENMP_SCHEDULE_MODIFIER(nonmonotonic) OPENMP_SCHEDULE_MODIFIER(simd) +// Modifiers for 'device' clause. +OPENMP_DEVICE_MODIFIER(ancestor) +OPENMP_DEVICE_MODIFIER(device_num) + // Static attributes for 'defaultmap' clause.
OPENMP_DEFAULTMAP_KIND(scalar) OPENMP_DEFAULTMAP_KIND(aggregate) @@ -1091,6 +1098,7 @@ OPENMP_DEPOBJ_CLAUSE(depend) OPENMP_DEPOBJ_CLAUSE(destroy) OPENMP_DEPOBJ_CLAUSE(update) +#undef OPENMP_DEVICE_MODIFIER #undef OPENMP_DEPOBJ_CLAUSE #undef OPENMP_FLUSH_CLAUSE #undef OPENMP_ORDER_KIND diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 43196663c45f65..46eeffe999d91d 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -51,6 +51,13 @@ enum OpenMPScheduleClauseModifier { OMPC_SCHEDULE_MODIFIER_last }; +/// OpenMP modifiers for 'device' clause. +enum OpenMPDeviceClauseModifier { +#define OPENMP_DEVICE_MODIFIER(Name) OMPC_DEVICE_##Name, +#include "clang/Basic/OpenMPKinds.def" + OMPC_DEVICE_unknown, +}; + /// OpenMP attributes for 'depend' clause. enum OpenMPDependClauseKind { #define OPENMP_DEPEND_KIND(Name) \ diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index ae6ce4837d76b7..45e45899de5f03 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -243,6 +243,26 @@ let params = T.Signed, pnt = PNT_NType in { defm vqrdmla: VQDMLA; } +multiclass VQDMLAD<int exchange, int round, int subtract> { + def "": Intrinsic<Vector, (args Vector:$a, Vector:$b, Vector:$c), + (IRInt<"vqdmlad", [Vector]> $a, $b, $c, + (u32 exchange), (u32 round), (u32 subtract))>; + def _m: Intrinsic<Vector, (args Vector:$a, Vector:$b, Vector:$c, Predicate:$pred), + (IRInt<"vqdmlad_predicated", [Vector, Predicate]> $a, $b, $c, + (u32 exchange), (u32 round), (u32 subtract), $pred)>; +} +let params = T.Signed in { + defm vqdmladhq: VQDMLAD<0, 0, 0>; + defm vqdmladhxq: VQDMLAD<1, 0, 0>; + defm vqdmlsdhq: VQDMLAD<0, 0, 1>; + defm vqdmlsdhxq: VQDMLAD<1, 0, 1>; + defm vqrdmladhq: VQDMLAD<0, 1, 0>; + defm vqrdmladhxq: VQDMLAD<1, 1, 0>; + defm vqrdmlsdhq: VQDMLAD<0, 1, 1>; + defm vqrdmlsdhxq: VQDMLAD<1, 1, 1>; +} + let params = !listconcat(T.Int16, T.Int32) in { let pnt = PNT_None in { def vmvnq_n: Intrinsic; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td def OP_FMLS_N : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>; def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>; def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>; -def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>; -def OP_MULX_LN : Op<(call "vmulx", $p0, (splat $p1, $p2))>; +def OP_MUL_LN : Op<(op "*", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULX_LN : Op<(call "vmulx", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_MULL_N : Op<(call "vmull", $p0, (dup $p1))>; -def OP_MULL_LN : Op<(call "vmull", $p0, (splat $p1, $p2))>; -def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>; -def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; +def OP_MULL_LN : Op<(call "vmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (call_mangled "splat_lane", $p1, $p2))>; +def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1), - (splat $p2, $p3)))>; -def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull",
(call "vget_high", $p1), - (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>; -def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>; +def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0), - (splat $p1, $p2))>; + (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>; -def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>; -def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>; -def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>; -def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>; +def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>; def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>; def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>; -def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; -def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; +def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>; def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>; def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2), @@ -115,7 +115,7 @@ def OP_HI : Op<(shuffle $p0, $p0, (highhalf mask0))>; def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>; def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>; def OP_DUP : Op<(dup $p0)>; -def OP_DUP_LN : Op<(splat $p0, $p1)>; +def OP_DUP_LN : Op<(call_mangled "splat_lane", $p0, $p1)>; def OP_SEL : Op<(cast "R", (op "|", (op "&", $p0, (cast $p0, $p1)), (op "&", (op "~", $p0), (cast $p0, $p2))))>; @@ -207,10 +207,10 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t", def OP_DOT_LN : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_DOT_LNQ : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; @@ -222,7 +222,19 @@ def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; //===----------------------------------------------------------------------===// -// Instructions +// Auxiliary Instructions 
+//===----------------------------------------------------------------------===// + +// Splat operation - performs a range-checked splat over a vector +def SPLAT : WInst<"splat_lane", ".(!q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">; +def SPLATQ : WInst<"splat_laneq", ".(!Q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> { + let isLaneQ = 1; +} + +//===----------------------------------------------------------------------===// +// Intrinsics //===----------------------------------------------------------------------===// //////////////////////////////////////////////////////////////////////////////// @@ -535,8 +547,8 @@ def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>; def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>; } let ArchGuard = "defined(__aarch64__)" in { -def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">; -def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">; +def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">; +def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">; } let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in { @@ -881,16 +893,22 @@ def COPY_LANE : IOpInst<"vcopy_lane", "..I.I", def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", - "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; + "csilPcPsPlUcUsUiUlfd", OP_COPY_LN> { + let isLaneQ = 1; +} def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>; def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", - OP_DUP_LN>; + OP_DUP_LN> { + let isLaneQ = 1; +} def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; @@ -906,38 +924,60 @@ def CREATE : NoTestOpInst<"vcreate", ".(IU>)", "dPl", OP_CAST> { //////////////////////////////////////////////////////////////////////////////// def VMLA_LANEQ : IOpInst<"vmla_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLA_LN> { + let isLaneQ = 1; +} def VMLS_LANEQ : IOpInst<"vmls_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLS_LN> { + let isLaneQ = 1; +} def VFMA_LANE : IInst<"vfma_lane", "...qI", "fdQfQd">; def VFMA_LANEQ : IInst<"vfma_laneq", "...QI", "fdQfQd"> { let isLaneQ = 1; } def VFMS_LANE : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>; -def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ> { + let isLaneQ = 1; +} -def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN>; +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN> { + let isLaneQ = 1; +} def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLALHi_LN>; def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "(>Q)(>Q)QQI", "siUsUi", - OP_MLALHi_LN>; -def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN>; + OP_MLALHi_LN> { + let isLaneQ = 1; +} +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN> { + let isLaneQ = 1; 
+} def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLSLHi_LN>; def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "(>Q)(>Q)QQI", "siUsUi", - OP_MLSLHi_LN>; + OP_MLSLHi_LN> { + let isLaneQ = 1; +} -def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN>; +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN> { + let isLaneQ = 1; +} def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLALHi_LN>; def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLALHi_LN>; -def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN>; + OP_QDMLALHi_LN> { + let isLaneQ = 1; +} +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN> { + let isLaneQ = 1; +} def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLSLHi_LN>; def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLSLHi_LN>; + OP_QDMLSLHi_LN> { + let isLaneQ = 1; +} // Newly add double parameter for vmul_lane in aarch64 // Note: d type is handled by SCALAR_VMUL_LANE @@ -945,32 +985,48 @@ def VMUL_LANE_A64 : IOpInst<"vmul_lane", "..qI", "Qd", OP_MUL_LN>; // Note: d type is handled by SCALAR_VMUL_LANEQ def VMUL_LANEQ : IOpInst<"vmul_laneq", "..QI", - "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN>; -def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN>; + "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN> { + let isLaneQ = 1; +} +def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN> { + let isLaneQ = 1; +} def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "(>Q)Q.I", "siUsUi", OP_MULLHi_LN>; def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "(>Q)QQI", "siUsUi", - OP_MULLHi_LN>; + OP_MULLHi_LN> { + let isLaneQ = 1; +} -def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN>; +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN> { + let isLaneQ = 1; +} def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si", OP_QDMULLHi_LN>; def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si", - OP_QDMULLHi_LN>; + OP_QDMULLHi_LN> { + let isLaneQ = 1; +} let isLaneQ = 1 in { def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">; def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">; } let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { -def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>; -def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>; +def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> { + let isLaneQ = 1; +} +def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> { + let isLaneQ = 1; +} } // Note: d type implemented by SCALAR_VMULX_LANE def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; // Note: d type is implemented by SCALAR_VMULX_LANEQ -def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Across vectors class @@ -1380,11 +1436,15 @@ def SCALAR_UQXTN : SInst<"vqmovn", "(1<)1", "SUsSUiSUl">; // Scalar Floating Point multiply (scalar, by element) def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "11.I", "SfSd", OP_SCALAR_MUL_LN>; -def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", 
"SfSd", OP_SCALAR_MUL_LN>; +def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; +} // Scalar Floating Point multiply extended (scalar, by element) def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "11.I", "SfSd", OP_SCALAR_MULX_LN>; -def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN>; +def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN> { + let isLaneQ = 1; +} def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">; @@ -1400,48 +1460,70 @@ def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "..QI", "d"> { def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "d", OP_SCALAR_VMULX_LN>; // VMULX_LANEQ d type implemented using scalar vmulx_laneq -def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ>; +def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">; -def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd">; +def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd"> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "111.I", "SfSd", OP_FMS_LN>; -def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ>; +def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply Long (scalar by element) def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "(1>)1.I", "SsSi", OP_SCALAR_QDMULL_LN>; -def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN>; +def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Add Long (scalar by element) def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi">; +def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Subtract Long (scalar by element) def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi">; +def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Scalar Integer Saturating Doubling Multiply Half High (scalar by element) def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "11.I", "SsSi", OP_SCALAR_QDMULH_LN>; -def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN>; +def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN> { + let isLaneQ = 1; +} // Scalar Integer Saturating Rounding Doubling Multiply Half High def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "11.I", "SsSi", OP_SCALAR_QRDMULH_LN>; -def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN>; +def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN> { + let isLaneQ = 1; +} let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>; -def SCALAR_SQRDMLAH_LANEQ : 
SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN>; +def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> { + let isLaneQ = 1; +} // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_QRDMLSH_LN>; -def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>; +def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN> { + let isLaneQ = 1; +} } def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; -def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; +def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> { + let isLaneQ = 1; +} } // ARMv8.2-A FP16 vector intrinsics for A32/A64. @@ -1605,36 +1687,52 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc // FMA lane def VFMA_LANEH : IInst<"vfma_lane", "...qI", "hQh">; - def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh">; + def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh"> { + let isLaneQ = 1; + } // FMA lane with scalar argument def FMLA_NH : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>; // Scalar floating point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "111.I", "Sh">; - def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh">; + def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh"> { + let isLaneQ = 1; + } // FMS lane def VFMS_LANEH : IOpInst<"vfms_lane", "...qI", "hQh", OP_FMS_LN>; - def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ>; + def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // FMS lane with scalar argument def FMLS_NH : SOpInst<"vfms_n", "...1", "hQh", OP_FMLS_N>; // Scalar floating foint fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "111.I", "Sh", OP_FMS_LN>; - def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ>; + def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // Mul lane - def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN>; + def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN> { + let isLaneQ = 1; + } // Scalar floating point multiply (scalar, by element) def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "11.I", "Sh", OP_SCALAR_MUL_LN>; - def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; + } // Mulx lane def VMULX_LANEH : IOpInst<"vmulx_lane", "..qI", "hQh", OP_MULX_LN>; - def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN>; + def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN> { + let isLaneQ = 1; + } def VMULX_NH : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>; // Scalar floating point mulx (scalar, by element) def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">; - def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh">; + def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh"> { + let isLaneQ = 1; + } // ARMv8.2-A FP16 reduction vector intrinsics. 
def VMAXVH : SInst<"vmaxv", "1.", "hQh">; @@ -1651,7 +1749,9 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc def VUZP2H : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>; def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "1.I", "Sh">; - def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh">; + def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh"> { + let isLaneQ = 1; + } } // v8.2-A dot product instructions. let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { } let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { // Variants indexing into a 128-bit vector are A64 only. - def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<q)I", "iQiUiQUi", OP_DOT_LNQ>; + def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<q)I", "iQiUiQUi", OP_DOT_LNQ> { + let isLaneQ = 1; + } } // v8.2-A FP16 fused multiply-add long instructions. @@ -1676,10 +1778,18 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in { def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN_Hi>; def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN_Hi>; - def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN>; - def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN>; - def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>; - def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>; + def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN> { + let isLaneQ = 1; + } + def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi> { + let isLaneQ = 1; + } } // v8.3-A Vector complex addition intrinsics diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td index 28b00d162a00d1..d817e7acb9135d 100644 --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -60,6 +60,15 @@ def op; // example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)" // (assuming $p0 has type int16x8_t). def call; +// call_mangled - Invoke another intrinsic matching the mangled name variation +// of the caller's base type. If there is no intrinsic defined +// that has the variation and takes the given types, an error +// is generated at tblgen time. +// example: (call_mangled "vfma_lane", $p0, $p1) -> "vfma_lane(__p0, __p1)" +// (assuming non-LaneQ caller) +// (call_mangled "vfma_lane", $p0, $p1) -> "vfma_laneq(__p0, __p1)" +// (assuming LaneQ caller) +def call_mangled; // cast - Perform a cast to a different type. This gets emitted as a static // C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use // "bitcast". @@ -79,6 +88,7 @@ def call; // - "D" - Double the number of lanes in the type. // - "8" - Convert type to an equivalent vector of 8-bit signed // integers. +// - "32" - Convert type to an equivalent vector of 32-bit integers. // example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return // value is of type "int32x4_t".
// (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0 @@ -100,12 +110,6 @@ def dup; // example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}" // (assuming __p1 is float16x4_t, and __p2 is a compatible scalar). def dup_typed; -// splat - Take a vector and a lane index, and return a vector of the same type -// containing repeated instances of the source vector at the lane index. -// example: (splat $p0, $p1) -> -// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)" -// (assuming __p0 has four elements). -def splat; // save_temp - Create a temporary (local) variable. The variable takes a name // based on the zero'th parameter and can be referenced using // using that name in subsequent DAGs in the same diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td index b7a2826d8fcb4c..cc30893703dfb9 100644 --- a/clang/include/clang/Driver/CC1Options.td +++ b/clang/include/clang/Driver/CC1Options.td @@ -388,8 +388,6 @@ def flto_visibility_public_std: def flto_unit: Flag<["-"], "flto-unit">, HelpText<"Emit IR to support LTO unit features (CFI, whole program vtable opt)">; def fno_lto_unit: Flag<["-"], "fno-lto-unit">; -def femit_debug_entry_values : Flag<["-"], "femit-debug-entry-values">, - HelpText<"Enables debug info about call site parameter's entry values">; def fdebug_pass_manager : Flag<["-"], "fdebug-pass-manager">, HelpText<"Prints debug information for the new pass manager">; def fno_debug_pass_manager : Flag<["-"], "fno-debug-pass-manager">, diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 85844e2edb070d..9a22bac75e62c6 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1814,7 +1814,9 @@ class Parser : public CodeCompletionHandler { bool EnteringContext, IdentifierInfo &II, CXXScopeSpec &SS); - bool ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, ParsedType ObjectType, + bool ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, + ParsedType ObjectType, + bool ObjectHasErrors, bool EnteringContext, bool *MayBePseudoDestructor = nullptr, bool IsTypename = false, @@ -2908,11 +2910,12 @@ class Parser : public CodeCompletionHandler { AccessSpecifier getAccessSpecifierIfPresent() const; bool ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, + ParsedType ObjectType, + bool ObjectHadErrors, SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, bool EnteringContext, - ParsedType ObjectType, UnqualifiedId &Id, bool AssumeTemplateId); bool ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, @@ -3029,11 +3032,13 @@ class Parser : public CodeCompletionHandler { /// Parses clause with a single expression and an additional argument /// of a kind \a Kind. /// + /// \param DKind Directive kind. /// \param Kind Kind of current clause. /// \param ParseOnly true to skip the clause's semantic actions and return /// nullptr. /// - OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, + OMPClause *ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, + OpenMPClauseKind Kind, bool ParseOnly); /// Parses clause without any additional arguments. 
/// @@ -3079,13 +3084,12 @@ class Parser : public CodeCompletionHandler { bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl<Expr *> &Vars, OpenMPVarListDataTy &Data); - bool ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, - bool AllowDestructorName, - bool AllowConstructorName, + bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, bool EnteringContext, + bool AllowDestructorName, bool AllowConstructorName, bool AllowDeductionGuide, - ParsedType ObjectType, - SourceLocation *TemplateKWLoc, - UnqualifiedId &Result); + SourceLocation *TemplateKWLoc, UnqualifiedId &Result); + /// Parses the mapper modifier in map, to, and from clauses. bool parseMapperModifier(OpenMPVarListDataTy &Data); /// Parses map-type-modifiers in map clause. diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f29e4f3c227c21..28f6705b307f86 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10576,8 +10576,10 @@ class Sema final { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); /// Called on well-formed 'device' clause. - OMPClause *ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc, + OMPClause *ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, + Expr *Device, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation EndLoc); /// Called on well-formed 'map' clause. OMPClause * diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h index 8f0c7edc58b43a..8830542f27d827 100644 --- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h @@ -167,7 +167,7 @@ class CheckerRegistry { } bool isDisabled(const LangOptions &LO) const { - return State == StateFromCmdLine::State_Disabled && ShouldRegister(LO); + return State == StateFromCmdLine::State_Disabled || !ShouldRegister(LO); } // Since each checker must have a different full name, we can identify diff --git a/clang/include/clang/module.modulemap b/clang/include/clang/module.modulemap index af1322acc289fa..15f891c1534015 100644 --- a/clang/include/clang/module.modulemap +++ b/clang/include/clang/module.modulemap @@ -45,6 +45,7 @@ module Clang_Basic { textual header "Basic/BuiltinsNEON.def" textual header "Basic/BuiltinsNVPTX.def" textual header "Basic/BuiltinsPPC.def" + textual header "Basic/BuiltinsSVE.def" textual header "Basic/BuiltinsSystemZ.def" textual header "Basic/BuiltinsWebAssembly.def" textual header "Basic/BuiltinsX86.def" diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp index 4ca4eacde8b77d..348fae1cfb192b 100644 --- a/clang/lib/AST/ComputeDependence.cpp +++ b/clang/lib/AST/ComputeDependence.cpp @@ -120,9 +120,9 @@ ExprDependence clang::computeDependence(BinaryConditionalOperator *E) { } ExprDependence clang::computeDependence(StmtExpr *E, unsigned TemplateDepth) { - auto D = ExprDependence::None; - if (E->getType()->isDependentType()) - D |= ExprDependence::Type; + // FIXME: why is unexpanded-pack not propagated? + auto D = toExprDependence(E->getType()->getDependence()) & + ~ExprDependence::UnexpandedPack; // Note: we treat a statement-expression in a dependent context as always // being value- and instantiation-dependent. This matches the behavior of // lambda-expressions and GCC.
@@ -172,7 +172,7 @@ ExprDependence clang::computeDependence(VAArgExpr *E) { ExprDependence clang::computeDependence(NoInitExpr *E) { return toExprDependence(E->getType()->getDependence()) & - ExprDependence::Instantiation; + (ExprDependence::Instantiation | ExprDependence::Error); } ExprDependence clang::computeDependence(ArrayInitLoopExpr *E) { @@ -213,8 +213,8 @@ ExprDependence clang::computeDependence(CXXRewrittenBinaryOperator *E) { ExprDependence clang::computeDependence(CXXStdInitializerListExpr *E) { auto D = turnTypeToValueDependence(E->getSubExpr()->getDependence()); - if (E->getType()->isDependentType()) - D |= ExprDependence::Type; + D |= toExprDependence(E->getType()->getDependence()) & + (ExprDependence::Type | ExprDependence::Error); return D; } @@ -296,13 +296,19 @@ ExprDependence clang::computeDependence(CXXNoexceptExpr *E, CanThrowResult CT) { return D; } +ExprDependence clang::computeDependence(PackExpansionExpr *E) { + return (E->getPattern()->getDependence() & ~ExprDependence::UnexpandedPack) | + ExprDependence::TypeValueInstantiation; +} + ExprDependence clang::computeDependence(SubstNonTypeTemplateParmExpr *E) { return E->getReplacement()->getDependence(); } ExprDependence clang::computeDependence(CoroutineSuspendExpr *E) { if (auto *Resume = E->getResumeExpr()) - return (Resume->getDependence() & ExprDependence::TypeValue) | + return (Resume->getDependence() & + (ExprDependence::TypeValue | ExprDependence::Error)) | (E->getCommonExpr()->getDependence() & ~ExprDependence::TypeValue); return E->getCommonExpr()->getDependence() | ExprDependence::TypeValueInstantiation; @@ -377,6 +383,7 @@ ExprDependence clang::computeDependence(DeclRefExpr *E, const ASTContext &Ctx) { if (Decl->isParameterPack()) Deps |= ExprDependence::UnexpandedPack; + Deps |= toExprDependence(Type->getDependence()) & ExprDependence::Error; // (TD) C++ [temp.dep.expr]p3: // An id-expression is type-dependent if it contains: @@ -496,6 +503,10 @@ ExprDependence clang::computeDependence(GenericSelectionExpr *E, bool ContainsUnexpandedPack) { auto D = ContainsUnexpandedPack ?
ExprDependence::UnexpandedPack : ExprDependence::None; + for (auto *AE : E->getAssocExprs()) + D |= AE->getDependence() & ExprDependence::Error; + D |= E->getControllingExpr()->getDependence() & ExprDependence::Error; + if (E->isResultDependent()) return D | ExprDependence::TypeValueInstantiation; return D | (E->getResultExpr()->getDependence() & @@ -623,7 +634,8 @@ ExprDependence clang::computeDependence(CXXUnresolvedConstructExpr *E) { if (E->getType()->getContainedDeducedType()) D |= ExprDependence::Type; for (auto *A : E->arguments()) - D |= A->getDependence() & ExprDependence::UnexpandedPack; + D |= A->getDependence() & + (ExprDependence::UnexpandedPack | ExprDependence::Error); return D; } @@ -643,6 +655,15 @@ ExprDependence clang::computeDependence(MaterializeTemporaryExpr *E) { return E->getSubExpr()->getDependence(); } +ExprDependence clang::computeDependence(CXXFoldExpr *E) { + auto D = ExprDependence::TypeValueInstantiation; + for (const auto *C : {E->getLHS(), E->getRHS()}) { + if (C) + D |= C->getDependence() & ~ExprDependence::UnexpandedPack; + } + return D; +} + ExprDependence clang::computeDependence(TypeTraitExpr *E) { auto D = ExprDependence::None; for (const auto *A : E->getArgs()) diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 6591b0481d4b33..9b0cc2b69dd434 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -4231,6 +4231,7 @@ DesignatedInitUpdateExpr::DesignatedInitUpdateExpr(const ASTContext &C, ILE->setType(baseExpr->getType()); BaseAndUpdaterExprs[1] = ILE; + // FIXME: this is wrong, set it correctly. setDependence(ExprDependence::None); } diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index b01aae43376391..a9578906705620 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1433,6 +1433,11 @@ void OMPClausePrinter::VisitOMPSIMDClause(OMPSIMDClause *) { OS << "simd"; } void OMPClausePrinter::VisitOMPDeviceClause(OMPDeviceClause *Node) { OS << "device("; + OpenMPDeviceClauseModifier Modifier = Node->getModifier(); + if (Modifier != OMPC_DEVICE_unknown) { + OS << getOpenMPSimpleClauseTypeName(Node->getClauseKind(), Modifier) + << ": "; + } Node->getDevice()->printPretty(OS, nullptr, Policy, 0); OS << ")"; } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 0b86dbb8734759..6a6d8692228afc 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -126,6 +126,11 @@ void TextNodeDumper::Visit(const Stmt *Node) { if (const auto *E = dyn_cast<Expr>(Node)) { dumpType(E->getType()); + if (E->containsErrors()) { + ColorScope Color(OS, ShowColors, ErrorsColor); + OS << " contains-errors"; + } + { ColorScope Color(OS, ShowColors, ValueKindColor); switch (E->getValueKind()) { diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index a6e2b9dbf1a19d..f2106531d6eb1e 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -154,6 +154,11 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, #define OPENMP_DEPEND_KIND(Name) .Case(#Name, OMPC_DEPEND_##Name) #include "clang/Basic/OpenMPKinds.def" .Default(OMPC_DEPEND_unknown); + case OMPC_device: + return llvm::StringSwitch<OpenMPDeviceClauseModifier>(Str) +#define OPENMP_DEVICE_MODIFIER(Name) .Case(#Name, OMPC_DEVICE_##Name) +#include "clang/Basic/OpenMPKinds.def" + .Default(OMPC_DEVICE_unknown); case OMPC_unknown: case OMPC_threadprivate: case OMPC_if: @@ -187,7 +192,6 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind,
case OMPC_acquire: case OMPC_release: case OMPC_relaxed: - case OMPC_device: case OMPC_threads: case OMPC_simd: case OMPC_num_teams: @@ -380,6 +384,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, #include "clang/Basic/OpenMPKinds.def" } llvm_unreachable("Invalid OpenMP 'depend' clause type"); + case OMPC_device: + switch (Type) { + case OMPC_DEVICE_unknown: + return "unknown"; +#define OPENMP_DEVICE_MODIFIER(Name) \ + case OMPC_DEVICE_##Name: \ + return #Name; +#include "clang/Basic/OpenMPKinds.def" + } + llvm_unreachable("Invalid OpenMP 'device' clause modifier"); case OMPC_unknown: case OMPC_threadprivate: case OMPC_if: @@ -413,7 +427,6 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_acquire: case OMPC_release: case OMPC_relaxed: - case OMPC_device: case OMPC_threads: case OMPC_simd: case OMPC_num_teams: diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index c063f8ca44720b..9181c715085e09 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -117,6 +117,9 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple, return new XCoreTargetInfo(Triple, Opts); case llvm::Triple::hexagon: + if (os == llvm::Triple::Linux && + Triple.getEnvironment() == llvm::Triple::Musl) + return new LinuxTargetInfo<HexagonTargetInfo>(Triple, Opts); return new HexagonTargetInfo(Triple, Opts); case llvm::Triple::lanai: diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 28e4ecc7b4bf32..e8f2524a25d5e9 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -491,7 +491,6 @@ static void initTargetOptions(llvm::TargetOptions &Options, Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning(); Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection; Options.EmitAddrsig = CodeGenOpts.Addrsig; - Options.EnableDebugEntryValues = CodeGenOpts.EnableDebugEntryValues; Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection; Options.EmitCallSiteInfo = CodeGenOpts.EmitCallSiteInfo; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index b0be1ecb545446..e42339dbcfccb8 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4495,10 +4495,15 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF, } } +Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C, + const ElementCount &Count) { + Value *SV = llvm::ConstantVector::getSplat(Count, C); + return Builder.CreateShuffleVector(V, V, SV, "lane"); +} + Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) { ElementCount EC = V->getType()->getVectorElementCount(); - Value *SV = llvm::ConstantVector::getSplat(EC, C); - return Builder.CreateShuffleVector(V, V, SV, "lane"); + return EmitNeonSplat(V, C, EC); } Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value *> &Ops, @@ -4605,6 +4610,10 @@ struct ARMVectorIntrinsicInfo { TypeModifier } static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), + NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), NEONMAP1(vabs_v, arm_neon_vabs, 0), @@ -4886,6 +4895,10 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), +
NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP1(vabs_v, aarch64_neon_abs, 0), NEONMAP1(vabsq_v, aarch64_neon_abs, 0), NEONMAP0(vaddhn_v), @@ -5460,6 +5473,19 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( switch (BuiltinID) { default: break; + case NEON::BI__builtin_neon_splat_lane_v: + case NEON::BI__builtin_neon_splat_laneq_v: + case NEON::BI__builtin_neon_splatq_lane_v: + case NEON::BI__builtin_neon_splatq_laneq_v: { + auto NumElements = VTy->getElementCount(); + if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v) + NumElements = NumElements * 2; + if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v) + NumElements = NumElements / 2; + + Ops[0] = Builder.CreateBitCast(Ops[0], VTy); + return EmitNeonSplat(Ops[0], cast(Ops[1]), NumElements); + } case NEON::BI__builtin_neon_vpadd_v: case NEON::BI__builtin_neon_vpaddq_v: // We don't allow fp/int overloading of intrinsics. @@ -5798,9 +5824,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { + llvm::Type *RTy = Ty; + if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || + BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) + RTy = llvm::VectorType::get(Ty->getVectorElementType(), + Ty->getVectorNumElements() * 2); llvm::Type *Tys[2] = { - Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false))}; + RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false))}; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); } case NEON::BI__builtin_neon_vqdmulhq_laneq_v: diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 94dab4c8561469..eeb1927177c50d 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4883,8 +4883,7 @@ llvm::DINode::DIFlags CGDebugInfo::getCallSiteRelatedAttrs() const { (CGM.getCodeGenOpts().getDebuggerTuning() == llvm::DebuggerKind::LLDB || CGM.getCodeGenOpts().getDebuggerTuning() == llvm::DebuggerKind::GDB); - if (!SupportsDWARFv4Ext && CGM.getCodeGenOpts().DwarfVersion < 5 && - !CGM.getCodeGenOpts().EnableDebugEntryValues) + if (!SupportsDWARFv4Ext && CGM.getCodeGenOpts().DwarfVersion < 5) return llvm::DINode::FlagZero; return llvm::DINode::FlagAllCallsDescribed; diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 6457852f1b6d75..671ada019cec02 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -866,8 +866,12 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, static bool isFlexibleArrayMemberExpr(const Expr *E) { // For compatibility with existing code, we treat arrays of length 0 or // 1 as flexible array members. + // FIXME: This is inconsistent with the warning code in SemaChecking. Unify + // the two mechanisms. const ArrayType *AT = E->getType()->castAsArrayTypeUnsafe(); if (const auto *CAT = dyn_cast(AT)) { + // FIXME: Sema doesn't treat [1] as a flexible array member if the bound + // was produced by macro expansion. if (CAT->getSize().ugt(1)) return false; } else if (!isa(AT)) @@ -880,6 +884,10 @@ static bool isFlexibleArrayMemberExpr(const Expr *E) { // FIXME: If the base type of the member expr is not FD->getParent(), // this should not be treated as a flexible array member access. 
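// Aside (illustrative, not from the patch): concrete cases for the
// isFlexibleArrayMemberExpr logic above and the union change just below.
struct S0 { int n; int tail[];  }; // C99 flexible array member
struct S1 { int n; int tail[1]; }; // length 0 or 1 is accepted for compat
struct S2 { int n; int tail[8]; }; // not flexible: bound greater than 1
union  U  { int raw; char bytes[1]; }; // after this patch, a [0]/[1]/[] union
                                       // member is accepted regardless of its
                                       // position among the union's fields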
if (const auto *FD = dyn_cast<FieldDecl>(ME->getMemberDecl())) { + // FIXME: Sema doesn't treat a T[1] union member as a flexible array + // member; only a T[0] or T[] member gets that treatment. + if (FD->getParent()->isUnion()) + return true; RecordDecl::field_iterator FI( DeclContext::decl_iterator(const_cast<FieldDecl *>(FD))); return ++FI == FD->getParent()->field_end(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index fa60221e8b5931..c4e4578c4b04c4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9483,7 +9483,7 @@ void CGOpenMPRuntime::emitTargetNumIterationsCall( void CGOpenMPRuntime::emitTargetCall( CodeGenFunction &CGF, const OMPExecutableDirective &D, llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, - const Expr *Device, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> SizeEmitter) { @@ -9507,6 +9507,16 @@ void CGOpenMPRuntime::emitTargetCall( auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, &MapTypesArray, &CS, RequiresOuterTask, &CapturedVars, SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { + if (Device.getInt() == OMPC_DEVICE_ancestor) { + // Reverse offloading is not supported, so just execute on the host. + if (RequiresOuterTask) { + CapturedVars.clear(); + CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); + } + emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + return; + } + // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target @@ -9521,9 +9531,13 @@ void CGOpenMPRuntime::emitTargetCall( // Emit device ID if any. llvm::Value *DeviceID; - if (Device) { - DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device), - CGF.Int64Ty, /*isSigned=*/true); + if (Device.getPointer()) { + assert((Device.getInt() == OMPC_DEVICE_unknown || + Device.getInt() == OMPC_DEVICE_device_num) && + "Expected device_num modifier."); + llvm::Value *DevVal = CGF.EmitScalarExpr(Device.getPointer()); + DeviceID = + CGF.Builder.CreateIntCast(DevVal, CGF.Int64Ty, /*isSigned=*/true); } else { DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF); } @@ -12135,7 +12149,7 @@ void CGOpenMPSIMDRuntime::emitTargetOutlinedFunction( void CGOpenMPSIMDRuntime::emitTargetCall( CodeGenFunction &CGF, const OMPExecutableDirective &D, llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, - const Expr *Device, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> SizeEmitter) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index b9df5ee1147354..99b201bcf6feab 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -20,6 +20,7 @@ #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSet.h" @@ -1504,16 +1505,16 @@ class CGOpenMPRuntime { /// \param IfCond Expression evaluated in if clause associated with the target /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the - /// target directive, or null if no device clause is used. + /// target directive (or null if no device clause is used), and the device clause modifier.
/// \param SizeEmitter Callback to emit number of iterations for loop-based /// directives. - virtual void - emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device, - llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> - SizeEmitter); + virtual void emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, + llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> + SizeEmitter); /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. Returns true if @@ -2275,14 +2276,14 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime { /// \param IfCond Expression evaluated in if clause associated with the target /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the - /// target directive, or null if no device clause is used. - void - emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device, - llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> - SizeEmitter) override; + /// target directive (or null if no device clause is used), and the device clause modifier. + void emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device, + llvm::function_ref<llvm::Value *(CodeGenFunction &CGF, const OMPLoopDirective &D)> + SizeEmitter) override; /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. Returns true if diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 74fabb1ba04e72..37bdb53bb1dc2e 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -4724,9 +4724,10 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, } // Check if we have any device clause associated with the directive. - const Expr *Device = nullptr; + llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device( + nullptr, OMPC_DEVICE_unknown); if (auto *C = S.getSingleClause<OMPDeviceClause>()) - Device = C->getDevice(); + Device.setPointerAndInt(C->getDevice(), C->getModifier()); // Check if we have an if clause whose conditional always evaluates to false // or if we do not have any targets specified.
If so the target region is not diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 440b088330efb2..0128a07030c515 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3894,6 +3894,8 @@ class CodeGenFunction : public CodeGenTypeCache { SmallVectorImpl &O, const char *name, unsigned shift = 0, bool rightshift = false); + llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, + const llvm::ElementCount &Count); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty, bool negateForRightShift); diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index b7dd6793efeafc..e0e25479bf5954 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -1483,7 +1483,8 @@ static void TranslateDArg(Arg *A, llvm::opt::DerivedArgList &DAL, llvm::opt::DerivedArgList * MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, - StringRef BoundArch, Action::OffloadKind) const { + StringRef BoundArch, + Action::OffloadKind OFK) const { DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs()); const OptTable &Opts = getDriver().getOpts(); @@ -1522,7 +1523,8 @@ MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, } else if (A->getOption().matches(options::OPT_D)) { // Translate -Dfoo#bar into -Dfoo=bar. TranslateDArg(A, *DAL, Opts); - } else { + } else if (OFK != Action::OFK_HIP) { + // HIP Toolchain translates input args by itself. DAL->append(A); } } diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 1b885b518f0d04..10a5f0e96f96f6 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -108,6 +108,9 @@ namespace format { TYPE(CSharpNullCoalescing) \ TYPE(CSharpNullConditional) \ TYPE(CSharpNullConditionalLSquare) \ + TYPE(CSharpGenericTypeConstraint) \ + TYPE(CSharpGenericTypeConstraintColon) \ + TYPE(CSharpGenericTypeConstraintComma) \ TYPE(Unknown) enum TokenType { @@ -779,6 +782,7 @@ struct AdditionalKeywords { kw_unsafe = &IdentTable.get("unsafe"); kw_ushort = &IdentTable.get("ushort"); kw_when = &IdentTable.get("when"); + kw_where = &IdentTable.get("where"); // Keep this at the end of the constructor to make sure everything here // is @@ -796,6 +800,7 @@ struct AdditionalKeywords { kw_is, kw_lock, kw_null, kw_object, kw_out, kw_override, kw_params, kw_readonly, kw_ref, kw_string, kw_stackalloc, kw_sbyte, kw_sealed, kw_uint, kw_ulong, kw_unchecked, kw_unsafe, kw_ushort, kw_when, + kw_where, // Keywords from the JavaScript section. kw_as, kw_async, kw_await, kw_declare, kw_finally, kw_from, kw_function, kw_get, kw_import, kw_is, kw_let, kw_module, kw_readonly, @@ -900,6 +905,7 @@ struct AdditionalKeywords { IdentifierInfo *kw_unsafe; IdentifierInfo *kw_ushort; IdentifierInfo *kw_when; + IdentifierInfo *kw_where; /// Returns \c true if \p Tok is a true JavaScript identifier, returns /// \c false if it is a keyword or a pseudo keyword. 
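// Returning to the EmitNeonSplat overload declared at the top of this hunk:
// a hedged sketch of its effect. The extra ElementCount parameter lets the
// splat produce a vector wider or narrower than its input, which the new
// splat_lane/splatq_lane builtin cases in CGBuiltin.cpp rely on. Assuming
// the vdup*_lane intrinsics are routed through those builtins, and compiling
// for an ARM/AArch64 target:
#include <arm_neon.h>
// Broadcast lane 1 of a 64-bit vector into every lane of a 128-bit vector.
int32x4_t dup_lane1(int32x2_t v) { return vdupq_lane_s32(v, 1); }
// Expected lowering (sketch): a constant shufflevector mask, roughly
//   shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>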
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index d546a9f7c60679..7193c8e6de448f 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1047,6 +1047,11 @@ class AnnotatingParser { Keywords.kw___has_include_next)) { parseHasInclude(); } + if (Tok->is(Keywords.kw_where) && Tok->Next && + Tok->Next->isNot(tok::l_paren)) { + Tok->Type = TT_CSharpGenericTypeConstraint; + parseCSharpGenericTypeConstraint(); + } break; default: break; @@ -1054,6 +1059,30 @@ class AnnotatingParser { return true; } + void parseCSharpGenericTypeConstraint() { + while (CurrentToken) { + if (CurrentToken->is(tok::less)) { + // parseAngle is too greedy and will consume the whole line. + CurrentToken->Type = TT_TemplateOpener; + next(); + } else if (CurrentToken->is(tok::greater)) { + CurrentToken->Type = TT_TemplateCloser; + next(); + } else if (CurrentToken->is(tok::comma)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraintComma; + next(); + } else if (CurrentToken->is(Keywords.kw_where)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraint; + next(); + } else if (CurrentToken->is(tok::colon)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraintColon; + next(); + } else { + next(); + } + } + } + void parseIncludeDirective() { if (CurrentToken && CurrentToken->is(tok::less)) { next(); @@ -3299,6 +3328,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, if (Right.is(TT_CSharpNamedArgumentColon) || Left.is(TT_CSharpNamedArgumentColon)) return false; + if (Right.is(TT_CSharpGenericTypeConstraint)) + return true; } else if (Style.Language == FormatStyle::LK_JavaScript) { // FIXME: This might apply to other languages and token kinds. if (Right.is(tok::string_literal) && Left.is(tok::plus) && Left.Previous && diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 84ccbec2150d12..a81d480c8e64fd 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -64,6 +64,8 @@ class LevelIndentTracker { } if (static_cast(Indent) + Offset >= 0) Indent += Offset; + if (Line.First->is(TT_CSharpGenericTypeConstraint)) + Indent = Line.Level * Style.IndentWidth + Style.ContinuationIndentWidth; } /// Update the indent state given that \p Line indent should be diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 00447ebdf5a92b..e2a6389cb26dfb 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -323,6 +323,24 @@ void UnwrappedLineParser::parseFile() { addUnwrappedLine(); } +void UnwrappedLineParser::parseCSharpGenericTypeConstraint() { + do { + switch (FormatTok->Tok.getKind()) { + case tok::l_brace: + return; + default: + if (FormatTok->is(Keywords.kw_where)) { + addUnwrappedLine(); + nextToken(); + parseCSharpGenericTypeConstraint(); + break; + } + nextToken(); + break; + } + } while (!eof()); +} + void UnwrappedLineParser::parseCSharpAttribute() { int UnpairedSquareBrackets = 1; do { @@ -1344,6 +1362,12 @@ void UnwrappedLineParser::parseStructuralElement() { parseTryCatch(); return; case tok::identifier: { + if (Style.isCSharp() && FormatTok->is(Keywords.kw_where) && + Line->MustBeDeclaration) { + addUnwrappedLine(); + parseCSharpGenericTypeConstraint(); + break; + } if (FormatTok->is(TT_MacroBlockEnd)) { addUnwrappedLine(); return; diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 
e184cf5354fd1a..42b8b51a37cc03 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -126,6 +126,10 @@ class UnwrappedLineParser { void parseJavaScriptEs6ImportExport(); void parseStatementMacro(); void parseCSharpAttribute(); + // Parse a C# generic type constraint: `where T : IComparable`. + // See: + // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint + void parseCSharpGenericTypeConstraint(); bool tryToParseLambda(); bool tryToParseLambdaIntroducer(); void tryToParseJSFunction(); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index e2b24f0cfcea1d..2e432ad9ce1481 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -788,10 +788,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, llvm::Triple T(TargetOpts.Triple); if (Opts.OptimizationLevel > 0 && Opts.hasReducedDebugInfo() && - llvm::is_contained(DebugEntryValueArchs, T.getArch())) { - Opts.EnableDebugEntryValues = Args.hasArg(OPT_femit_debug_entry_values); + llvm::is_contained(DebugEntryValueArchs, T.getArch())) Opts.EmitCallSiteInfo = true; - } Opts.DisableO0ImplyOptNone = Args.hasArg(OPT_disable_O0_optnone); Opts.DisableRedZone = Args.hasArg(OPT_disable_red_zone); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 17298dad456405..ba4f5d86612a36 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -4421,7 +4421,8 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, ColonProtectionRAIIObject X(*this, AllowDeclaration); CXXScopeSpec Spec; - if (ParseOptionalCXXScopeSpecifier(Spec, nullptr, + if (ParseOptionalCXXScopeSpecifier(Spec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true)) return; @@ -5254,7 +5255,8 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide) { // Parse the C++ scope specifier. 
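// Aside on the clang-format changes above: a hedged sketch of exercising the
// new C# 'where' support through the public reformat() API (the entry points
// are real; the program itself is illustrative, not a test from this patch).
#include "clang/Format/Format.h"
#include "clang/Tooling/Core/Replacement.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::StringRef Code =
      "class Box<T> where T : System.IComparable<T> { T Value; }";
  clang::format::FormatStyle Style =
      clang::format::getMicrosoftStyle(clang::format::FormatStyle::LK_CSharp);
  clang::tooling::Replacements Replaces = clang::format::reformat(
      Style, Code, {clang::tooling::Range(0, Code.size())});
  auto Formatted = clang::tooling::applyAllReplacements(Code, Replaces);
  if (Formatted)
    llvm::outs() << *Formatted; // the 'where' constraint is forced onto its
                                // own continuation-indented line
  else
    llvm::consumeError(Formatted.takeError());
  return 0;
}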
CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true)) { TPA.Revert(); return false; @@ -5634,7 +5636,8 @@ void Parser::ParseDeclaratorInternal(Declarator &D, D.getContext() == DeclaratorContext::FileContext || D.getContext() == DeclaratorContext::MemberContext; CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext); + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, EnteringContext); if (SS.isNotEmpty()) { if (Tok.isNot(tok::star)) { @@ -5857,8 +5860,9 @@ void Parser::ParseDirectDeclarator(Declarator &D) { bool EnteringContext = D.getContext() == DeclaratorContext::FileContext || D.getContext() == DeclaratorContext::MemberContext; - ParseOptionalCXXScopeSpecifier(D.getCXXScopeSpec(), nullptr, - EnteringContext); + ParseOptionalCXXScopeSpecifier( + D.getCXXScopeSpec(), /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, EnteringContext); } if (D.getCXXScopeSpec().isValid()) { @@ -5932,10 +5936,11 @@ void Parser::ParseDirectDeclarator(Declarator &D) { bool HadScope = D.getCXXScopeSpec().isValid(); if (ParseUnqualifiedId(D.getCXXScopeSpec(), + /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true, /*AllowDestructorName=*/true, AllowConstructorName, - AllowDeductionGuide, nullptr, nullptr, - D.getName()) || + AllowDeductionGuide, nullptr, D.getName()) || // Once we're past the identifier, if the scope was bad, mark the // whole declarator bad. D.getCXXScopeSpec().isInvalid()) { diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 09e5c7996fcd53..85dc4e3e706a8b 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -290,7 +290,9 @@ Decl *Parser::ParseNamespaceAlias(SourceLocation NamespaceLoc, CXXScopeSpec SS; // Parse (optional) nested-name-specifier. - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, /*IsTypename=*/false, /*LastII=*/nullptr, @@ -530,7 +532,9 @@ Decl *Parser::ParseUsingDirective(DeclaratorContext Context, CXXScopeSpec SS; // Parse (optional) nested-name-specifier. - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, /*IsTypename=*/false, /*LastII=*/nullptr, @@ -597,7 +601,9 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, // Parse nested-name-specifier. 
IdentifierInfo *LastII = nullptr; - if (ParseOptionalCXXScopeSpecifier(D.SS, nullptr, /*EnteringContext=*/false, + if (ParseOptionalCXXScopeSpecifier(D.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDtor=*/nullptr, /*IsTypename=*/false, /*LastII=*/&LastII, @@ -632,12 +638,12 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, D.Name.setConstructorName(Type, IdLoc, IdLoc); } else { if (ParseUnqualifiedId( - D.SS, /*EnteringContext=*/false, + D.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/true, - /*AllowConstructorName=*/!(Tok.is(tok::identifier) && - NextToken().is(tok::equal)), - /*AllowDeductionGuide=*/false, - nullptr, nullptr, D.Name)) + /*AllowConstructorName=*/ + !(Tok.is(tok::identifier) && NextToken().is(tok::equal)), + /*AllowDeductionGuide=*/false, nullptr, D.Name)) return true; } @@ -1115,7 +1121,9 @@ TypeResult Parser::ParseBaseTypeSpecifier(SourceLocation &BaseLoc, // Parse optional nested-name-specifier CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false)) return true; BaseLoc = Tok.getLocation(); @@ -1547,7 +1555,9 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, CXXScopeSpec Spec; bool HasValidSpec = true; - if (ParseOptionalCXXScopeSpecifier(Spec, nullptr, EnteringContext)) { + if (ParseOptionalCXXScopeSpecifier(Spec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) { DS.SetTypeSpecError(); HasValidSpec = false; } @@ -2501,7 +2511,8 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, if (isAccessDecl) { // Collect the scope specifier token we annotated earlier. CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); if (SS.isInvalid()) { @@ -2512,8 +2523,9 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, // Try to parse an unqualified-id. SourceLocation TemplateKWLoc; UnqualifiedId Name; - if (ParseUnqualifiedId(SS, false, true, true, false, nullptr, - &TemplateKWLoc, Name)) { + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false, true, true, + false, &TemplateKWLoc, Name)) { SkipUntil(tok::semi); return nullptr; } @@ -3493,7 +3505,9 @@ void Parser::ParseConstructorInitializer(Decl *ConstructorDecl) { MemInitResult Parser::ParseMemInitializer(Decl *ConstructorDecl) { // parse '::'[opt] nested-name-specifier[opt] CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false)) return true; // : identifier diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index b038e6935d8736..7bd1230a77500f 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -1529,7 +1529,8 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, // type, translate it into a type and continue parsing as a // cast expression. 
CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); AnnotateTemplateIdTokenAsType(SS); return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, @@ -1983,9 +1984,9 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { if (LHS.isInvalid()) break; - ParseOptionalCXXScopeSpecifier(SS, ObjectType, - /*EnteringContext=*/false, - &MayBePseudoDestructor); + ParseOptionalCXXScopeSpecifier( + SS, ObjectType, LHS.get() && LHS.get()->containsErrors(), + /*EnteringContext=*/false, &MayBePseudoDestructor); if (SS.isNotEmpty()) ObjectType = nullptr; } @@ -2045,14 +2046,13 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { IdentifierInfo *Id = Tok.getIdentifierInfo(); SourceLocation Loc = ConsumeToken(); Name.setIdentifier(Id, Loc); - } else if (ParseUnqualifiedId(SS, - /*EnteringContext=*/false, - /*AllowDestructorName=*/true, - /*AllowConstructorName=*/ - getLangOpts().MicrosoftExt && - SS.isNotEmpty(), - /*AllowDeductionGuide=*/false, - ObjectType, &TemplateKWLoc, Name)) { + } else if (ParseUnqualifiedId( + SS, ObjectType, LHS.get() && LHS.get()->containsErrors(), + /*EnteringContext=*/false, + /*AllowDestructorName=*/true, + /*AllowConstructorName=*/ + getLangOpts().MicrosoftExt && SS.isNotEmpty(), + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) { (void)Actions.CorrectDelayedTyposInExpr(LHS); LHS = ExprError(); } diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 10608644a8fe9e..b8d91c19228f15 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -124,6 +124,10 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, /// the "." or "->" of a member access expression, this parameter provides the /// type of the object whose members are being accessed. /// +/// \param ObjectHadErrors if this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// When true, diagnostics for missing 'template' keyword will be suppressed. +/// /// \param EnteringContext whether we will be entering into the context of /// the nested-name-specifier after parsing it. /// @@ -146,14 +150,10 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, /// /// /// \returns true if there was an error parsing a scope specifier -bool Parser::ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, - ParsedType ObjectType, - bool EnteringContext, - bool *MayBePseudoDestructor, - bool IsTypename, - IdentifierInfo **LastII, - bool OnlyNamespace, - bool InUsingDeclaration) { +bool Parser::ParseOptionalCXXScopeSpecifier( + CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, + bool EnteringContext, bool *MayBePseudoDestructor, bool IsTypename, + IdentifierInfo **LastII, bool OnlyNamespace, bool InUsingDeclaration) { assert(getLangOpts().CPlusPlus && "Call sites of this function should be guarded by checking for C++"); @@ -511,17 +511,21 @@ bool Parser::ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, if (MemberOfUnknownSpecialization && (ObjectType || SS.isSet()) && (IsTypename || isTemplateArgumentList(1) == TPResult::True)) { - // We have something like t::getAs<T>, where getAs is a - // member of an unknown specialization. However, this will only - // parse correctly as a template, so suggest the keyword 'template' - // before 'getAs' and treat this as a dependent template name.
- unsigned DiagID = diag::err_missing_dependent_template_keyword; - if (getLangOpts().MicrosoftExt) - DiagID = diag::warn_missing_dependent_template_keyword; - - Diag(Tok.getLocation(), DiagID) - << II.getName() - << FixItHint::CreateInsertion(Tok.getLocation(), "template "); + // If we had errors before, ObjectType can be dependent even without any + // templates; do not report a missing template keyword in that case. + if (!ObjectHadErrors) { + // We have something like t::getAs<T>, where getAs is a + // member of an unknown specialization. However, this will only + // parse correctly as a template, so suggest the keyword 'template' + // before 'getAs' and treat this as a dependent template name. + unsigned DiagID = diag::err_missing_dependent_template_keyword; + if (getLangOpts().MicrosoftExt) + DiagID = diag::warn_missing_dependent_template_keyword; + + Diag(Tok.getLocation(), DiagID) + << II.getName() + << FixItHint::CreateInsertion(Tok.getLocation(), "template "); + } if (TemplateNameKind TNK = Actions.ActOnDependentTemplateName( getCurScope(), SS, Tok.getLocation(), TemplateName, ObjectType, @@ -593,12 +597,12 @@ ExprResult Parser::tryParseCXXIdExpression(CXXScopeSpec &SS, default: SourceLocation TemplateKWLoc; UnqualifiedId Name; - if (ParseUnqualifiedId(SS, + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/false, /*AllowConstructorName=*/false, - /*AllowDeductionGuide=*/false, - /*ObjectType=*/nullptr, &TemplateKWLoc, Name)) + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) return ExprError(); // This is only the direct operand of an & operator if it is not @@ -666,7 +670,9 @@ ExprResult Parser::ParseCXXIdExpression(bool isAddressOfOperand) { // '::' unqualified-id // CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false); Token Replacement; ExprResult Result = @@ -1769,10 +1775,10 @@ Parser::ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, // If there is a '<', the second type name is a template-id. Parse // it as such. if (Tok.is(tok::less) && - ParseUnqualifiedIdTemplateId(SS, SourceLocation(), - Name, NameLoc, - false, ObjectType, SecondTypeName, - /*AssumeTemplateId=*/true)) + ParseUnqualifiedIdTemplateId( + SS, ObjectType, Base && Base->containsErrors(), SourceLocation(), + Name, NameLoc, false, SecondTypeName, + /*AssumeTemplateId=*/true)) return ExprError(); return Actions.ActOnPseudoDestructorExpr(getCurScope(), Base, OpLoc, OpKind, @@ -2259,6 +2265,12 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// \param SS the nested-name-specifier that precedes this template-id, if /// we're actually parsing a qualified-id. /// +/// \param ObjectType if this unqualified-id occurs within a member access +/// expression, the type of the base object whose member is being accessed. +/// +/// \param ObjectHadErrors if this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// /// \param Name for constructor and destructor names, this is the actual /// identifier that may be a template-name. /// @@ -2268,9 +2280,6 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// \param EnteringContext whether we're entering the scope of the /// nested-name-specifier.
/// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// /// \param Id as input, describes the template-name or operator-function-id /// that precedes the '<'. If template arguments were parsed successfully, /// will be updated with the template-id. @@ -2279,14 +2288,10 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// refers to a template without performing name lookup to verify. /// /// \returns true if a parse error occurred, false otherwise. -bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - IdentifierInfo *Name, - SourceLocation NameLoc, - bool EnteringContext, - ParsedType ObjectType, - UnqualifiedId &Id, - bool AssumeTemplateId) { +bool Parser::ParseUnqualifiedIdTemplateId( + CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, + SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, + bool EnteringContext, UnqualifiedId &Id, bool AssumeTemplateId) { assert(Tok.is(tok::less) && "Expected '<' to finish parsing a template-id"); TemplateTy Template; @@ -2318,23 +2323,27 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, if (TNK == TNK_Non_template && MemberOfUnknownSpecialization && ObjectType && isTemplateArgumentList(0) == TPResult::True) { - // We have something like t->getAs<T>(), where getAs is a - // member of an unknown specialization. However, this will only - // parse correctly as a template, so suggest the keyword 'template' - // before 'getAs' and treat this as a dependent template name. - std::string Name; - if (Id.getKind() == UnqualifiedIdKind::IK_Identifier) - Name = std::string(Id.Identifier->getName()); - else { - Name = "operator "; - if (Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId) - Name += getOperatorSpelling(Id.OperatorFunctionId.Operator); - else - Name += Id.Identifier->getName(); + // If we had errors before, ObjectType can be dependent even without any + // templates; do not report a missing template keyword in that case. + if (!ObjectHadErrors) { + // We have something like t->getAs<T>(), where getAs is a + // member of an unknown specialization. However, this will only + // parse correctly as a template, so suggest the keyword 'template' + // before 'getAs' and treat this as a dependent template name. + std::string Name; + if (Id.getKind() == UnqualifiedIdKind::IK_Identifier) + Name = std::string(Id.Identifier->getName()); + else { + Name = "operator "; + if (Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId) + Name += getOperatorSpelling(Id.OperatorFunctionId.Operator); + else + Name += Id.Identifier->getName(); + } + Diag(Id.StartLocation, diag::err_missing_dependent_template_keyword) + << Name + << FixItHint::CreateInsertion(Id.StartLocation, "template "); } - Diag(Id.StartLocation, diag::err_missing_dependent_template_keyword) - << Name - << FixItHint::CreateInsertion(Id.StartLocation, "template "); TNK = Actions.ActOnDependentTemplateName( getCurScope(), SS, TemplateKWLoc, Id, ObjectType, EnteringContext, Template, /*AllowInjectedClassName*/ true); @@ -2691,6 +2700,13 @@ bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, /// \param SS The nested-name-specifier that preceded this unqualified-id. If /// non-empty, then we are parsing the unqualified-id of a qualified-id.
/// +/// \param ObjectType if this unqualified-id occurs within a member access +/// expression, the type of the base object whose member is being accessed. +/// +/// \param ObjectHadErrors if this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// When true, diagnostics for missing 'template' keyword will be suppressed. +/// /// \param EnteringContext whether we are entering the scope of the /// nested-name-specifier. /// @@ -2700,17 +2716,14 @@ bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, /// /// \param AllowDeductionGuide whether we allow parsing a deduction guide name. /// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// /// \param Result on a successful parse, contains the parsed unqualified-id. /// /// \returns true if parsing fails, false otherwise. -bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, +bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, bool EnteringContext, bool AllowDestructorName, bool AllowConstructorName, bool AllowDeductionGuide, - ParsedType ObjectType, SourceLocation *TemplateKWLoc, UnqualifiedId &Result) { if (TemplateKWLoc) @@ -2769,8 +2782,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, TemplateTy Template; if (Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), Id, IdLoc, - EnteringContext, ObjectType, Result, TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), Id, IdLoc, + EnteringContext, Result, TemplateSpecified); else if (TemplateSpecified && Actions.ActOnDependentTemplateName( getCurScope(), SS, *TemplateKWLoc, Result, ObjectType, @@ -2847,9 +2861,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, Result.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId) && Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), nullptr, - SourceLocation(), EnteringContext, ObjectType, Result, - TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), nullptr, + SourceLocation(), EnteringContext, Result, TemplateSpecified); else if (TemplateSpecified && Actions.ActOnDependentTemplateName( getCurScope(), SS, *TemplateKWLoc, Result, ObjectType, @@ -2899,7 +2913,8 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, AnnotateScopeToken(SS, /*NewAnnotation*/true); SS.clear(); } - if (ParseOptionalCXXScopeSpecifier(SS, ObjectType, EnteringContext)) + if (ParseOptionalCXXScopeSpecifier(SS, ObjectType, ObjectHadErrors, + EnteringContext)) return true; if (SS.isNotEmpty()) ObjectType = nullptr; @@ -2926,8 +2941,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, if (Tok.is(tok::less)) { Result.setDestructorName(TildeLoc, nullptr, ClassNameLoc); return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), ClassName, - ClassNameLoc, EnteringContext, ObjectType, Result, TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), ClassName, + ClassNameLoc, EnteringContext, Result, TemplateSpecified); } // Note that this is a destructor name.
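// Illustration of the code path above (not from the patch's tests). On a
// dependent base, a member that only parses as a template normally gets the
// missing-'template' diagnostic with a fix-it:
template <typename T> void use(T t) {
  t.getAs<int>();          // error: use 'template' keyword to treat 'getAs'
                           // as a dependent template name
  t.template getAs<int>(); // OK, assuming T has such a member template
}
// With ObjectHadErrors threaded through, that hint is suppressed when the
// base expression itself already contained errors, where the dependence is
// an artifact of error recovery rather than of a real template context.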
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 86c5996683bd4c..6f6de47aa0bb3d 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2287,12 +2287,14 @@ bool Parser::ParseOpenMPSimpleVarList( NoIdentIsFound = false; if (AllowScopeSpecifier && getLangOpts().CPlusPlus && - ParseOptionalCXXScopeSpecifier(SS, nullptr, false)) { + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false)) { IsCorrect = false; SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); - } else if (ParseUnqualifiedId(SS, false, false, false, false, nullptr, - nullptr, Name)) { + } else if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false, false, + false, false, nullptr, Name)) { IsCorrect = false; SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); @@ -2363,7 +2365,6 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_simdlen: case OMPC_collapse: case OMPC_ordered: - case OMPC_device: case OMPC_num_teams: case OMPC_thread_limit: case OMPC_priority: @@ -2379,8 +2380,6 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, // Only one safelen clause can appear on a simd directive. // Only one simdlen clause can appear on a simd directive. // Only one collapse clause can appear on a simd directive. - // OpenMP [2.9.1, target data construct, Restrictions] - // At most one device clause can appear on the directive. // OpenMP [2.11.1, task Construct, Restrictions] // At most one if clause can appear on the directive. // At most one final clause can appear on the directive. @@ -2428,6 +2427,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, Clause = ParseOpenMPSimpleClause(CKind, WrongDirective); break; + case OMPC_device: case OMPC_schedule: case OMPC_dist_schedule: case OMPC_defaultmap: @@ -2435,6 +2435,8 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, // Only one schedule clause can appear on a loop directive. // OpenMP 4.5 [2.10.4, Restrictions, p. 106] // At most one defaultmap clause can appear on the directive. + // OpenMP 5.0 [2.12.5, target construct, Restrictions] + // At most one device clause can appear on the directive. 
if ((getLangOpts().OpenMP < 50 || CKind != OMPC_defaultmap) && !FirstClause) { Diag(Tok, diag::err_omp_more_one_clause) @@ -2443,7 +2445,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, } LLVM_FALLTHROUGH; case OMPC_if: - Clause = ParseOpenMPSingleExprWithArgClause(CKind, WrongDirective); + Clause = ParseOpenMPSingleExprWithArgClause(DKind, CKind, WrongDirective); break; case OMPC_nowait: case OMPC_untied: @@ -2677,7 +2679,11 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) { /// defaultmap: /// 'defaultmap' '(' modifier ':' kind ')' /// +/// device-clause: +/// 'device' '(' [ device-modifier ':' ] expression ')' +/// -OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, +OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, + OpenMPClauseKind Kind, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); SourceLocation DelimLoc; @@ -2771,6 +2777,21 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) && Tok.isNot(tok::annot_pragma_openmp_end)) ConsumeAnyToken(); + } else if (Kind == OMPC_device) { + // Only target executable directives support the extended device construct. + if (isOpenMPTargetExecutionDirective(DKind) && getLangOpts().OpenMP >= 50 && + NextToken().is(tok::colon)) { + // Parse the optional device modifier. + Arg.push_back(getOpenMPSimpleClauseType( + Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok))); + KLoc.push_back(Tok.getLocation()); + ConsumeAnyToken(); + // Parse ':' + ConsumeAnyToken(); + } else { + Arg.push_back(OMPC_DEVICE_unknown); + KLoc.emplace_back(); + } } else { assert(Kind == OMPC_if); KLoc.push_back(Tok.getLocation()); @@ -2793,7 +2814,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, bool NeedAnExpression = (Kind == OMPC_schedule && DelimLoc.isValid()) || (Kind == OMPC_dist_schedule && DelimLoc.isValid()) || - Kind == OMPC_if; + Kind == OMPC_if || Kind == OMPC_device; if (NeedAnExpression) { SourceLocation ELoc = Tok.getLocation(); ExprResult LHS(ParseCastExpression(AnyCastExpr, false, NotTypeCast)); @@ -2855,11 +2876,12 @@ static bool ParseReductionId(Parser &P, CXXScopeSpec &ReductionIdScopeSpec, return false; } } - return P.ParseUnqualifiedId(ReductionIdScopeSpec, /*EnteringContext*/ false, - /*AllowDestructorName*/ false, - /*AllowConstructorName*/ false, - /*AllowDeductionGuide*/ false, - nullptr, nullptr, ReductionId); + return P.ParseUnqualifiedId( + ReductionIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false, + /*AllowDestructorName*/ false, + /*AllowConstructorName*/ false, + /*AllowDeductionGuide*/ false, nullptr, ReductionId); } /// Checks if the token is a valid map-type-modifier.
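// Illustrative OpenMP 5.0 source for the device-clause grammar above (the
// modifiers come from this patch; the snippet is not one of its tests):
void offload(int *p, int dev) {
#pragma omp target device(device_num : dev) map(tofrom : p[:8])
  { p[0] += 1; } // explicit device-number expression, as before

#pragma omp target device(ancestor : 1)
  { } // reverse offloading; per the CGOpenMPRuntime change above, clang
      // currently executes this region on the host
}
// Note: the 5.0 spec ties 'ancestor' to 'requires reverse_offload', so a
// conforming program would also carry that requires directive.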
@@ -2887,6 +2909,7 @@ bool Parser::parseMapperModifier(OpenMPVarListDataTy &Data) { if (getLangOpts().CPlusPlus) ParseOptionalCXXScopeSpecifier(Data.ReductionOrMapperIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); if (Tok.isNot(tok::identifier) && Tok.isNot(tok::kw_default)) { Diag(Tok.getLocation(), diag::err_omp_mapper_illegal_identifier); @@ -2992,6 +3015,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, if (getLangOpts().CPlusPlus) ParseOptionalCXXScopeSpecifier(Data.ReductionOrMapperIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); InvalidReductionId = ParseReductionId( *this, Data.ReductionOrMapperIdScopeSpec, UnqualifiedReductionId); diff --git a/clang/lib/Parse/ParseStmtAsm.cpp b/clang/lib/Parse/ParseStmtAsm.cpp index 2e369448ab6a64..262def2b38a1a9 100644 --- a/clang/lib/Parse/ParseStmtAsm.cpp +++ b/clang/lib/Parse/ParseStmtAsm.cpp @@ -220,9 +220,10 @@ ExprResult Parser::ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, // Parse an optional scope-specifier if we're in C++. CXXScopeSpec SS; - if (getLangOpts().CPlusPlus) { - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); - } + if (getLangOpts().CPlusPlus) + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false); // Require an identifier here. SourceLocation TemplateKWLoc; @@ -233,12 +234,13 @@ ExprResult Parser::ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, Result = ParseCXXThis(); Invalid = false; } else { - Invalid = ParseUnqualifiedId(SS, - /*EnteringContext=*/false, - /*AllowDestructorName=*/false, - /*AllowConstructorName=*/false, - /*AllowDeductionGuide=*/false, - /*ObjectType=*/nullptr, &TemplateKWLoc, Id); + Invalid = + ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, + /*AllowDestructorName=*/false, + /*AllowConstructorName=*/false, + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Id); // Perform the lookup. 
Result = Actions.LookupInlineAsmIdentifier(SS, TemplateKWLoc, Id, IsUnevaluatedContext); diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 0406820f74a334..802fe35d4f62aa 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -363,9 +363,11 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, DiagnoseAndSkipCXX11Attributes(); CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, ParsedType(), - /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, - /*IsTypename=*/false, /*LastII=*/nullptr, /*OnlyNamespace=*/true) || + if (ParseOptionalCXXScopeSpecifier( + SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, + /*MayBePseudoDestructor=*/nullptr, + /*IsTypename=*/false, /*LastII=*/nullptr, /*OnlyNamespace=*/true) || SS.isInvalid()) { SkipUntil(tok::semi); return nullptr; @@ -376,12 +378,12 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, diag::err_concept_definition_not_identifier); UnqualifiedId Result; - if (ParseUnqualifiedId(SS, /*EnteringContext=*/false, + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/false, /*AllowConstructorName=*/false, /*AllowDeductionGuide=*/false, - /*ObjectType=*/ParsedType(), /*TemplateKWLoc=*/nullptr, - Result)) { + /*TemplateKWLoc=*/nullptr, Result)) { SkipUntil(tok::semi); return nullptr; } @@ -682,19 +684,19 @@ bool Parser::TryAnnotateTypeConstraint() { return false; CXXScopeSpec SS; bool WasScopeAnnotation = Tok.is(tok::annot_cxxscope); - if (ParseOptionalCXXScopeSpecifier( - SS, ParsedType(), - /*EnteringContext=*/false, - /*MayBePseudoDestructor=*/nullptr, - // If this is not a type-constraint, then - // this scope-spec is part of the typename - // of a non-type template parameter - /*IsTypename=*/true, /*LastII=*/nullptr, - // We won't find concepts in - // non-namespaces anyway, so might as well - // parse this correctly for possible type - // names. - /*OnlyNamespace=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, + /*MayBePseudoDestructor=*/nullptr, + // If this is not a type-constraint, then + // this scope-spec is part of the typename + // of a non-type template parameter + /*IsTypename=*/true, /*LastII=*/nullptr, + // We won't find concepts in + // non-namespaces anyway, so might as well + // parse this correctly for possible type + // names. + /*OnlyNamespace=*/false)) return true; if (Tok.is(tok::identifier)) { @@ -754,7 +756,8 @@ NamedDecl *Parser::ParseTypeParameter(unsigned Depth, unsigned Position) { TemplateIdAnnotation *TypeConstraint = nullptr; bool TypenameKeyword = false; SourceLocation KeyLoc; - ParseOptionalCXXScopeSpecifier(TypeConstraintSS, nullptr, + ParseOptionalCXXScopeSpecifier(TypeConstraintSS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false); if (Tok.is(tok::annot_template_id)) { // Consume the 'type-constraint'. @@ -1438,7 +1441,8 @@ ParsedTemplateArgument Parser::ParseTemplateTemplateArgument() { // followed by a token that terminates a template argument, such as ',', // '>', or (in some cases) '>>'. 
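// Illustrative C++20 input for the concept-definition and type-constraint
// parsing touched above (not from the patch):
template <typename T>
concept Small = sizeof(T) <= 8; // handled by ParseConceptDefinition

template <Small T> // 'Small' is annotated via TryAnnotateTypeConstraint
T twice(T x) { return x + x; }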
CXXScopeSpec SS; // nested-name-specifier, if present - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); ParsedTemplateArgument Result; diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 27cb8a2a5e762f..0a63ac2d5e1bce 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1605,7 +1605,9 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC) { CXXScopeSpec SS; if (getLangOpts().CPlusPlus && - ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext)) + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) return ANK_Error; if (Tok.isNot(tok::identifier) || SS.isInvalid()) { @@ -1842,6 +1844,7 @@ bool Parser::TryAnnotateTypeOrScopeToken() { SourceLocation TypenameLoc = ConsumeToken(); CXXScopeSpec SS; if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, nullptr, /*IsTypename*/ true)) return true; @@ -1914,7 +1917,9 @@ bool Parser::TryAnnotateTypeOrScopeToken() { CXXScopeSpec SS; if (getLangOpts().CPlusPlus) - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext*/ false)) return true; return TryAnnotateTypeOrScopeTokenAfterScopeSpec(SS, !WasScopeAnnotation); @@ -2043,7 +2048,9 @@ bool Parser::TryAnnotateCXXScopeToken(bool EnteringContext) { assert(MightBeCXXScopeToken() && "Cannot be a type or scope token!"); CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) return true; if (SS.isEmpty()) return false; @@ -2152,7 +2159,8 @@ bool Parser::ParseMicrosoftIfExistsCondition(IfExistsCondition& Result) { // Parse nested-name-specifier. if (getLangOpts().CPlusPlus) - ParseOptionalCXXScopeSpecifier(Result.SS, nullptr, + ParseOptionalCXXScopeSpecifier(Result.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); // Check nested-name specifier. @@ -2163,10 +2171,12 @@ bool Parser::ParseMicrosoftIfExistsCondition(IfExistsCondition& Result) { // Parse the unqualified-id. SourceLocation TemplateKWLoc; // FIXME: parsed, but unused. 
- if (ParseUnqualifiedId( - Result.SS, /*EnteringContext*/false, /*AllowDestructorName*/true, - /*AllowConstructorName*/true, /*AllowDeductionGuide*/false, nullptr, - &TemplateKWLoc, Result.Name)) { + if (ParseUnqualifiedId(Result.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false, + /*AllowDestructorName*/ true, + /*AllowConstructorName*/ true, + /*AllowDeductionGuide*/ false, &TemplateKWLoc, + Result.Name)) { T.skipToEnd(); return true; } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 67b7fa6cb46f25..393fdcb479d531 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -4980,6 +4980,17 @@ static bool ArmCdeAliasValid(unsigned BuiltinID, StringRef AliasName) { return ArmBuiltinAliasValid(BuiltinID, AliasName, Map, IntrinNames); } +static bool ArmSveAliasValid(unsigned BuiltinID, StringRef AliasName) { + switch (BuiltinID) { + default: + return false; +#define GET_SVE_BUILTINS +#define BUILTIN(name, types, attr) case SVE::BI##name: +#include "clang/Basic/arm_sve_builtins.inc" + return true; + } +} + static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (!AL.isArgIdent(0)) { S.Diag(AL.getLoc(), diag::err_attribute_argument_n_type) @@ -4991,8 +5002,10 @@ static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast<NamedDecl>(D)->getIdentifier()->getName(); - if (!ArmMveAliasValid(BuiltinID, AliasName) && - !ArmCdeAliasValid(BuiltinID, AliasName)) { + bool IsAArch64 = S.Context.getTargetInfo().getTriple().isAArch64(); + if ((IsAArch64 && !ArmSveAliasValid(BuiltinID, AliasName)) || + (!IsAArch64 && !ArmMveAliasValid(BuiltinID, AliasName) && + !ArmCdeAliasValid(BuiltinID, AliasName))) { S.Diag(AL.getLoc(), diag::err_attribute_arm_builtin_alias); return; } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e6afe7a5b42126..c9de06ce76cb92 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17425,7 +17425,10 @@ class EvaluatedExprMarker : public UsedDeclVisitor<EvaluatedExprMarker> { S.MarkDeclRefReferenced(E); } - void VisitMemberExpr(MemberExpr *E) { S.MarkMemberReferenced(E); } + void VisitMemberExpr(MemberExpr *E) { + S.MarkMemberReferenced(E); + Visit(E->getBase()); + } }; } // namespace diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 8c2f8b2a942ff0..7d0821829daa50 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11043,9 +11043,6 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_ordered: Res = ActOnOpenMPOrderedClause(StartLoc, EndLoc, LParenLoc, Expr); break; - case OMPC_device: - Res = ActOnOpenMPDeviceClause(Expr, StartLoc, LParenLoc, EndLoc); - break; case OMPC_num_teams: Res = ActOnOpenMPNumTeamsClause(Expr, StartLoc, LParenLoc, EndLoc); break; @@ -11070,6 +11067,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_detach: Res = ActOnOpenMPDetachClause(Expr, StartLoc, LParenLoc, EndLoc); break; + case OMPC_device: case OMPC_if: case OMPC_default: case OMPC_proc_bind: @@ -12440,6 +12438,12 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( StartLoc, LParenLoc, ArgumentLoc[Modifier], ArgumentLoc[DefaultmapKind], EndLoc); break; + case OMPC_device: + assert(Argument.size() == 1 && ArgumentLoc.size() == 1); + Res = ActOnOpenMPDeviceClause( + static_cast<OpenMPDeviceClauseModifier>(Argument.back()), Expr, + StartLoc, LParenLoc,
ArgumentLoc.back(), EndLoc); + break; case OMPC_final: case OMPC_num_threads: case OMPC_safelen: @@ -12477,7 +12481,6 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( case OMPC_release: case OMPC_relaxed: case OMPC_depend: - case OMPC_device: case OMPC_threads: case OMPC_simd: case OMPC_map: @@ -15641,16 +15644,32 @@ Sema::ActOnOpenMPDependClause(OpenMPDependClauseKind DepKind, return C; } -OMPClause *Sema::ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc, +OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, + Expr *Device, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation EndLoc) { + assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 50) && + "Unexpected device modifier in OpenMP < 50."); + + bool ErrorFound = false; + if (ModifierLoc.isValid() && Modifier == OMPC_DEVICE_unknown) { + std::string Values = + getListOfPossibleValues(OMPC_device, /*First=*/0, OMPC_DEVICE_unknown); + Diag(ModifierLoc, diag::err_omp_unexpected_clause_value) + << Values << getOpenMPClauseName(OMPC_device); + ErrorFound = true; + } + Expr *ValExpr = Device; Stmt *HelperValStmt = nullptr; // OpenMP [2.9.1, Restrictions] // The device expression must evaluate to a non-negative integer value. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_device, - /*StrictlyPositive=*/false)) + ErrorFound = !isNonNegativeIntegerValue(ValExpr, *this, OMPC_device, + /*StrictlyPositive=*/false) || + ErrorFound; + if (ErrorFound) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); @@ -15663,8 +15682,9 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc, HelperValStmt = buildPreInits(Context, Captures); } - return new (Context) OMPDeviceClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (Context) + OMPDeviceClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, + LParenLoc, ModifierLoc, EndLoc); } static bool checkTypeMappable(SourceLocation SL, SourceRange SR, Sema &SemaRef, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index c7e90ad982ef19..bc1a977dbd5d04 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1847,11 +1847,13 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new statement. /// Subclasses may override this routine to provide different behavior. - OMPClause *RebuildOMPDeviceClause(Expr *Device, SourceLocation StartLoc, + OMPClause *RebuildOMPDeviceClause(OpenMPDeviceClauseModifier Modifier, + Expr *Device, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDeviceClause(Device, StartLoc, LParenLoc, - EndLoc); + return getSema().ActOnOpenMPDeviceClause(Modifier, Device, StartLoc, + LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'map' clause. 
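// Tying the pieces together before the serialization hunks below (hedged
// illustration, not from the patch's tests): under -frecovery-ast an
// ill-formed call can be preserved as an expression whose dependence
// includes ExprDependence::Error, so its -ast-dump line now shows
// "contains-errors" (per the TextNodeDumper change above), and the new
// ContainsErrors record field lets that bit survive PCH/module round-trips.
int takes_one(int);
int broken = takes_one(); // too few arguments: kept as an error-dependent
                          // expression instead of being dropped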
@@ -9256,8 +9258,9 @@ TreeTransform<Derived>::TransformOMPDeviceClause(OMPDeviceClause *C) { ExprResult E = getDerived().TransformExpr(C->getDevice()); if (E.isInvalid()) return nullptr; - return getDerived().RebuildOMPDeviceClause(E.get(), C->getBeginLoc(), - C->getLParenLoc(), C->getEndLoc()); + return getDerived().RebuildOMPDeviceClause( + C->getModifier(), E.get(), C->getBeginLoc(), C->getLParenLoc(), + C->getModifierLoc(), C->getEndLoc()); } template <typename Derived> diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 9f6bfebeabcbe6..dd0fa9f70dafd5 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12294,7 +12294,9 @@ void OMPClauseReader::VisitOMPDependClause(OMPDependClause *C) { void OMPClauseReader::VisitOMPDeviceClause(OMPDeviceClause *C) { VisitOMPClauseWithPreInit(C); + C->setModifier(Record.readEnum<OpenMPDeviceClauseModifier>()); C->setDevice(Record.readSubExpr()); + C->setModifierLoc(Record.readSourceLocation()); C->setLParenLoc(Record.readSourceLocation()); } diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index d74b0d514eda05..1fbcab209303de 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -106,7 +106,8 @@ namespace clang { /// The number of record fields required for the Expr class /// itself. - static const unsigned NumExprFields = NumStmtFields + 7; + static const unsigned NumExprFields = + NumStmtFields + ExprDependenceBits + 3; /// Read and initialize a ExplicitTemplateArgumentList structure. void ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args, @@ -517,6 +518,7 @@ void ASTStmtReader::VisitExpr(Expr *E) { bool ValueDependent = Record.readInt(); bool InstantiationDependent = Record.readInt(); bool ContainsUnexpandedTemplateParameters = Record.readInt(); + bool ContainsErrors = Record.readInt(); auto Deps = ExprDependence::None; if (TypeDependent) Deps |= ExprDependence::Type; @@ -526,6 +528,8 @@ void ASTStmtReader::VisitExpr(Expr *E) { Deps |= ExprDependence::Instantiation; if (ContainsUnexpandedTemplateParameters) Deps |= ExprDependence::UnexpandedPack; + if (ContainsErrors) + Deps |= ExprDependence::Error; E->setDependence(Deps); E->setValueKind(static_cast<ExprValueKind>(Record.readInt())); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index b549fe9df01634..c96e46543dbada 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6376,7 +6376,9 @@ void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) { void OMPClauseWriter::VisitOMPDeviceClause(OMPDeviceClause *C) { VisitOMPClauseWithPreInit(C); + Record.writeEnum(C->getModifier()); Record.AddStmt(C->getDevice()); + Record.AddSourceLocation(C->getModifierLoc()); Record.AddSourceLocation(C->getLParenLoc()); } diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index f7d89299e140c8..0fc90f5d835b64 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2280,6 +2280,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind 
Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //DeclRefExpr @@ -2303,6 +2304,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //Integer Literal @@ -2321,6 +2323,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //Character Literal @@ -2339,6 +2342,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind // CastExpr diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 588977525b65ae..b25b2df8783abb 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -540,6 +540,7 @@ void ASTStmtWriter::VisitExpr(Expr *E) { Record.push_back(E->isValueDependent()); Record.push_back(E->isInstantiationDependent()); Record.push_back(E->containsUnexpandedParameterPack()); + Record.push_back(E->containsErrors()); Record.push_back(E->getValueKind()); Record.push_back(E->getObjectKind()); } diff --git a/clang/test/Analysis/debug-CallGraph.cpp b/clang/test/Analysis/debug-CallGraph.cpp index 0f5a83b268a018..5453e2f215337a 100644 --- a/clang/test/Analysis/debug-CallGraph.cpp +++ b/clang/test/Analysis/debug-CallGraph.cpp @@ -81,8 +81,26 @@ namespace Lambdas { } } +namespace CallDecl { + void SomeDecl(); + void SomeOtherDecl(); + void SomeDef() {} + + void Caller() { + SomeDecl(); + SomeOtherDecl(); + } + + void SomeOtherDecl() { + SomeDef(); + } +} + // CHECK:--- Call graph Dump --- -// CHECK-NEXT: {{Function: < root > calls: get5 add test_add mmm foo aaa < > bbb ddd ccc eee fff do_nothing test_single_call SomeNS::templ SomeNS::templ SomeNS::templUser Lambdas::Callee Lambdas::f1 Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) $}} +// CHECK-NEXT: {{Function: < root > calls: get5 add test_add mmm foo aaa < > bbb ddd ccc eee fff do_nothing test_single_call SomeNS::templ SomeNS::templ SomeNS::templUser Lambdas::Callee Lambdas::f1 Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) CallDecl::SomeDef CallDecl::Caller CallDecl::SomeOtherDecl $}} +// CHECK-NEXT: {{Function: CallDecl::Caller calls: CallDecl::SomeOtherDecl $}} +// CHECK-NEXT: {{Function: CallDecl::SomeOtherDecl calls: CallDecl::SomeDef $}} +// CHECK-NEXT: {{Function: 
CallDecl::SomeDef calls: $}} // CHECK-NEXT: {{Function: Lambdas::f1 calls: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) $}} // CHECK-NEXT: {{Function: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) calls: Lambdas::Callee $}} // CHECK-NEXT: {{Function: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) calls: Lambdas::Callee $}} diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c index 5ad06cfff80866..25c0ae4988b74f 100644 --- a/clang/test/CodeGen/aarch64-neon-2velem.c +++ b/clang/test/CodeGen/aarch64-neon-2velem.c @@ -7,8 +7,10 @@ // CHECK-LABEL: @test_vmla_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -18,8 +20,10 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -29,8 +33,10 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmla_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -40,8 +46,10 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -51,8 +59,10 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_s16( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -62,8 +72,10 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -73,8 +85,10 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -84,8 +98,10 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -95,8 +111,10 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -106,8 +124,10 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: 
@test_vmlsq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -117,8 +137,10 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmls_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -128,8 +150,10 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -139,8 +163,10 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -150,8 +176,10 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -161,8 +189,10 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t 
b, int16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -172,8 +202,10 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -183,8 +215,10 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmul_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { @@ -193,8 +227,10 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { @@ -203,8 +239,10 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmul_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { @@ -213,8 +251,10 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { // 
CHECK-LABEL: @test_vmulq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { @@ -223,8 +263,10 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -233,8 +275,10 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -243,8 +287,10 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmul_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -253,8 +299,10 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -263,8 +311,10 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmul_laneq_s16( 
// CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { @@ -273,8 +323,10 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { @@ -283,8 +335,10 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { @@ -293,8 +347,10 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { @@ -303,8 +359,10 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmul_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -313,8 +371,10 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_u16( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -323,8 +383,10 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -333,8 +395,10 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -584,10 +648,12 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) { // CHECK-LABEL: @test_vmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -597,10 +663,12 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -610,10 +678,12 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -623,10 +693,12 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -637,10 +709,12 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> 
[[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -651,10 +725,12 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -665,10 +741,12 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -679,10 +757,12 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> 
[[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -692,10 +772,12 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -705,10 +787,12 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -718,10 +802,12 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -731,10 +817,12 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -745,10 +833,12 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -759,10 +849,12 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 
x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -773,10 +865,12 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -787,10 +881,12 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -800,10 +896,12 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -813,10 +911,12 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -826,10 +926,12 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -839,10 +941,12 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -853,10 +957,12 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -867,10 +973,12 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -881,10 +989,12 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -895,10 +1005,12 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -908,10 +1020,12 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -921,10 +1035,12 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: 
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -934,10 +1050,12 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -947,10 +1065,12 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -961,10 +1081,12 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -975,10 +1097,12 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vmlsl_high_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -989,10 +1113,12 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1003,10 +1129,12 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1016,10 +1144,12 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1028,10 +1158,12 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1040,10 +1172,12 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -1052,10 +1186,12 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -1065,10 +1201,12 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1078,10 +1216,12 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1091,10 +1231,12 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
@@ -1104,10 +1246,12 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
// CHECK-LABEL: @test_vmull_high_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
@@ -1116,10 +1260,12 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
// CHECK-LABEL: @test_vmull_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1128,10 +1274,12 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1140,10 +1288,12 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -1152,10 +1302,12 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -1165,10 +1317,12 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -1178,10 +1332,12 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -1191,10 +1347,12 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
@@ -1204,10 +1362,12 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
// CHECK-LABEL: @test_vmull_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
@@ -1216,11 +1376,13 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
@@ -1230,11 +1392,13 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
@@ -1245,11 +1409,13 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
@@ -1260,11 +1426,13 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
@@ -1274,11 +1442,13 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
@@ -1288,11 +1458,13 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
@@ -1303,11 +1475,13 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
@@ -1318,11 +1492,13 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
@@ -1332,10 +1508,12 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1345,10 +1523,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1358,10 +1538,12 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1371,10 +1553,12 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1385,10 +1569,12 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1399,10 +1585,12 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-LABEL: @test_vqdmull_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1413,10 +1601,12 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1427,10 +1617,12 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1544,8 +1736,10 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1568,11 +1762,12 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
return vmul_lane_f64(a, v, 0);
}
-
// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1581,8 +1776,10 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1591,8 +1788,10 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1614,11 +1813,12 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
return vmul_laneq_f64(a, v, 1);
}
-
// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1627,8 +1827,10 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1637,10 +1839,12 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1649,10 +1853,12 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1661,10 +1867,12 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1673,10 +1881,12 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1685,10 +1895,12 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1697,10 +1909,12 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1709,8 +1923,10 @@ float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[ADD]]
 //
@@ -1720,8 +1936,10 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[ADD]]
 //
@@ -1731,8 +1949,10 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmla_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[ADD]]
 //
@@ -1742,8 +1962,10 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -1753,8 +1975,10 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmla_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[ADD]]
 //
@@ -1764,8 +1988,10 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[ADD]]
 //
@@ -1775,8 +2001,10 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmla_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[ADD]]
 //
@@ -1786,8 +2014,10 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -1797,8 +2027,10 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmls_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[SUB]]
 //
@@ -1808,8 +2040,10 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[SUB]]
 //
@@ -1819,8 +2053,10 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmls_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[SUB]]
 //
@@ -1830,8 +2066,10 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -1841,8 +2079,10 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmls_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[SUB]]
 //
@@ -1852,8 +2092,10 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[SUB]]
 //
@@ -1863,8 +2105,10 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmls_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[SUB]]
 //
@@ -1874,8 +2118,10 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -1885,8 +2131,10 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmul_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -1895,8 +2143,10 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vmulq_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -1905,8 +2155,10 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-LABEL: @test_vmul_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -1915,8 +2167,10 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmulq_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -1925,8 +2179,10 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmul_lane_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -1935,8 +2191,10 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
 // CHECK-LABEL: @test_vmulq_lane_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -1945,8 +2203,10 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
 // CHECK-LABEL: @test_vmul_lane_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -1955,8 +2215,10 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
 // CHECK-LABEL: @test_vmulq_lane_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -1965,8 +2227,10 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
 // CHECK-LABEL: @test_vmul_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -1975,8 +2239,10 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -1985,8 +2251,10 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-LABEL: @test_vmul_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -1995,8 +2263,10 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -2005,8 +2275,10 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
 // CHECK-LABEL: @test_vmul_laneq_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -2015,8 +2287,10 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -2025,8 +2299,10 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
 // CHECK-LABEL: @test_vmul_laneq_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -2035,8 +2311,10 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -2210,10 +2488,12 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
 // CHECK-LABEL: @test_vmlal_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2223,10 +2503,12 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2236,10 +2518,12 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2249,10 +2533,12 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2263,10 +2549,12 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2277,10 +2565,12 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2291,10 +2581,12 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2305,10 +2597,12 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2318,10 +2612,12 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2331,10 +2627,12 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2344,10 +2642,12 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2357,10 +2657,12 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2371,10 +2673,12 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2385,10 +2689,12 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2399,10 +2705,12 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2413,10 +2721,12 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2426,10 +2736,12 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_lane_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2439,10 +2751,12 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_lane_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2452,10 +2766,12 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2465,10 +2781,12 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2479,10 +2797,12 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2493,10 +2813,12 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2507,10 +2829,12 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2521,10 +2845,12 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2534,10 +2860,12 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2547,10 +2875,12 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2560,10 +2890,12 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2573,10 +2905,12 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2587,10 +2921,12 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2601,10 +2937,12 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2615,10 +2953,12 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
<8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2629,10 +2969,12 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2642,10 +2984,12 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -2654,10 +2998,12 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_lane_s32_0( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -2666,10 +3012,12 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -2678,10 +3026,12 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -2691,10 +3041,12 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> 
[[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -2704,10 +3056,12 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -2717,10 +3071,12 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t 
test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -2730,10 +3086,12 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -2742,10 +3100,12 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -2754,10 +3114,12 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // 
CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -2766,10 +3128,12 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -2778,10 +3142,12 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -2791,10 +3157,12 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> 
[[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -2804,10 +3172,12 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -2817,10 +3187,12 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -2830,10 +3202,12 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -2842,11 +3216,13 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2856,11 +3232,13 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2871,11 +3249,13 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 
x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2886,11 +3266,13 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2900,11 +3282,13 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x 
i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2914,11 +3298,13 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2929,11 +3315,13 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2944,11 +3332,13 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, 
int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2958,10 +3348,12 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2971,10 +3363,12 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> 
[[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -2984,10 +3378,12 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2997,10 +3393,12 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3011,10 +3409,12 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x 
i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3025,10 +3425,12 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3039,10 +3441,12 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3053,10 +3457,12 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3170,8 +3576,10 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3180,8 +3588,10 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3190,8 +3600,10 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmul_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3215,8 +3627,10 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
<4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3225,8 +3639,10 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -3235,10 +3651,12 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulx_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3247,10 +3665,12 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32_0(float32x4_t 
a, float32x2_t v) { @@ -3259,10 +3679,12 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { @@ -3271,10 +3693,12 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulx_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3283,10 +3707,12 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> 
[[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3295,10 +3721,12 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -4461,8 +4889,10 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmla_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4472,8 +4902,10 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4483,8 +4915,10 @@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmla_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x 
i32> [[ADD]] // @@ -4494,8 +4928,10 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4505,8 +4941,10 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4516,8 +4954,10 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4527,8 +4967,10 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4538,8 +4980,10 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -4549,11 +4993,13 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4
 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
 //
@@ -4563,11 +5009,13 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4
 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4
 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
 //
@@ -4578,11 +5026,13 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x
i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4593,11 +5043,13 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4607,8 +5059,10 @@ int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4618,8 +5072,10 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], 
<8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -4629,8 +5085,10 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmls_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4640,8 +5098,10 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4651,8 +5111,10 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4662,8 +5124,10 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ 
-4673,8 +5137,10 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4684,8 +5150,10 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4695,11 +5163,13 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4709,11 +5179,13 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4724,11 +5196,13 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4739,11 +5213,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4857,8 +5333,10 @@ int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmla_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4868,8 +5346,10 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4879,8 +5359,10 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmla_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4890,8 +5372,10 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4901,8 +5385,10 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4912,8 +5398,10 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4923,8 +5411,10 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4934,8 +5424,10 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4945,11 +5437,13 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// 
CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4959,11 +5453,13 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4974,11 +5470,13 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4989,11 +5487,13 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> 
[[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -5003,8 +5503,10 @@ int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5014,8 +5516,10 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5025,8 +5529,10 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmls_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5036,8 +5542,10 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], 
[[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5047,8 +5555,10 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5058,8 +5568,10 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5069,8 +5581,10 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5080,8 +5594,10 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5091,11 +5607,13 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x 
i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5105,11 +5623,13 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -5120,11 +5640,13 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> 
[[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5135,11 +5657,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c index ae02bfbffb985b..c2dd315ed9fc4f 100644 --- a/clang/test/CodeGen/aarch64-neon-fma.c +++ b/clang/test/CodeGen/aarch64-neon-fma.c @@ -69,144 +69,177 @@ float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> 
[[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> 
[[SUB]] +// float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 3); } diff --git a/clang/test/CodeGen/aarch64-poly64.c b/clang/test/CodeGen/aarch64-poly64.c index 8c4ef23bb7e148..b7fb1db9b0ff36 100644 --- a/clang/test/CodeGen/aarch64-poly64.c +++ b/clang/test/CodeGen/aarch64-poly64.c @@ -150,22 +150,28 @@ poly64x2_t test_vmovq_n_p64(poly64_t a) { } // CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { return vdup_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) { return vdupq_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <2 x i64>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK: ret <2 x i64> [[LANE]]
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
   return vdupq_laneq_p64(vec, 1);
 }
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c
new file mode 100644
index 00000000000000..90258f00de43db
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s -D__ARM_FEATURE_SVE | FileCheck %s
+
+#include <arm_sve.h>
+//
+// ld1
+//
+
+svint8_t test_svld1_s8(svbool_t pg, const int8_t *base)
+{
+ // CHECK-LABEL: test_svld1_s8
+ // CHECK: @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* %{{.*}}, i32 1, <vscale x 16 x i1> %{{.*}}, <vscale x 16 x i8> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svint16_t test_svld1_s16(svbool_t pg, const int16_t *base)
+{
+ // CHECK-LABEL: test_svld1_s16
+ // CHECK: @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* %{{.*}}, i32 1, <vscale x 8 x i1> %{{.*}}, <vscale x 8 x i16> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svint32_t test_svld1_s32(svbool_t pg, const int32_t *base)
+{
+ // CHECK-LABEL: test_svld1_s32
+ // CHECK: @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 1, <vscale x 4 x i1> %{{.*}}, <vscale x 4 x i32> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svint64_t test_svld1_s64(svbool_t pg, const int64_t *base)
+{
+ // CHECK-LABEL: test_svld1_s64
+ // CHECK: @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.*}}, i32 1, <vscale x 2 x i1> %{{.*}}, <vscale x 2 x i64> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svuint8_t test_svld1_u8(svbool_t pg, const uint8_t *base)
+{
+ // CHECK-LABEL: test_svld1_u8
+ // CHECK: @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* %{{.*}}, i32 1, <vscale x 16 x i1> %{{.*}}, <vscale x 16 x i8> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svuint16_t test_svld1_u16(svbool_t pg, const uint16_t *base)
+{
+ // CHECK-LABEL: test_svld1_u16
+ // CHECK: @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* %{{.*}}, i32 1, <vscale x 8 x i1> %{{.*}}, <vscale x 8 x i16> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svuint32_t test_svld1_u32(svbool_t pg, const uint32_t *base)
+{
+ // CHECK-LABEL: test_svld1_u32
+ // CHECK: @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 1, <vscale x 4 x i1> %{{.*}}, <vscale x 4 x i32> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svuint64_t test_svld1_u64(svbool_t pg, const uint64_t *base)
+{
+ // CHECK-LABEL: test_svld1_u64
+ // CHECK: @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.*}}, i32 1, <vscale x 2 x i1> %{{.*}}, <vscale x 2 x i64> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svfloat16_t test_svld1_f16(svbool_t pg, const float16_t *base)
+{
+ // CHECK-LABEL: test_svld1_f16
+ // CHECK: @llvm.masked.load.nxv8f16.p0nxv8f16(<vscale x 8 x half>* %{{.*}}, i32 1, <vscale x 8 x i1> %{{.*}}, <vscale x 8 x half> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svfloat32_t test_svld1_f32(svbool_t pg, const float32_t *base)
+{
+ // CHECK-LABEL: test_svld1_f32
+ // CHECK: @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %{{.*}}, i32 1, <vscale x 4 x i1> %{{.*}}, <vscale x 4 x float> zeroinitializer)
+ return svld1(pg, base);
+}
+
+svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base)
+{
+ // CHECK-LABEL: test_svld1_f64
+ // CHECK: @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>* %{{.*}}, i32 1, <vscale x 2 x i1> %{{.*}}, <vscale x 2 x double> zeroinitializer)
+ return svld1(pg, base);
+}
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index fc339faa6cdbc7..59b3dfec80cb92 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1086,32 +1086,40 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
 }
 
 //
CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { return vmul_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulq_laneq_f16(a, b, 7); @@ -1165,33 +1173,49 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { } // CHECK-LABEL: test_vmulx_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { return vmulx_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulxq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x
half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulxq_lane_f16(a, b, 7); + return vmulxq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulx_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { return vmulx_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulxq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulxq_laneq_f16(a, b, 7); } @@ -1473,17 +1497,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) { } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } //
CHECK-LABEL: @test_vext_f16( diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c b/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c new file mode 100644 index 00000000000000..cd59e6ccdfb75d --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vqdmlad.c @@ -0,0 +1,677 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include <arm_mve.h> + +// CHECK-LABEL: @test_vqdmladhq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmladhq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmladhq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmladhq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmladhxq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhxq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmladhxq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhxq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, i32 0) +//
CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmladhxq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmladhxq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmlsdhq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmlsdhq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmlsdhq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqdmlsdhxq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhxq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqdmlsdhxq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhxq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmlsdhxq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqdmlsdhxq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t 
test_vqrdmladhq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqrdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmladhq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqrdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmladhq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqrdmladhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, i32 0) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqrdmladhxq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqrdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhxq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmladhxq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqrdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhxq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmladhxq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqrdmladhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmladhxq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqrdmlsdhq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmlsdhq_s16(int16x8_t 
inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmlsdhq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vqrdmlsdhxq_s8(int8x16_t inactive, int8x16_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_s8(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vqrdmlsdhxq_s16(int16x8_t inactive, int16x8_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_s16(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqrdmlsdhxq_s32(int32x4_t inactive, int32x4_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq(inactive, a, b); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_s32(inactive, a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmladhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmladhq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmladhq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ 
+ return vqdmladhq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmladhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmladhq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmladhxq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmladhxq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmladhxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmladhxq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmladhxq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmladhxq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmladhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmladhxq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 0, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmlsdhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ 
+ return vqdmlsdhq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmlsdhq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmlsdhq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmlsdhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmlsdhq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 0, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqdmlsdhxq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmlsdhxq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqdmlsdhxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqdmlsdhxq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlsdhxq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmlsdhxq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlsdhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + 
return vqdmlsdhxq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmladhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmladhq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmladhq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmladhq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmladhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmladhq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmladhxq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmladhxq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmladhxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhxq_m(inactive, a, b, p); +#else /* 
POLYMORPHIC */ + return vqrdmladhxq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmladhxq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmladhxq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmladhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmladhxq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 0, i32 1, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmlsdhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 0, i32 1, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmlsdhq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, i32 1, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmlsdhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], i32 1, i32 1, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vqrdmlsdhxq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq_m(inactive, a, 
b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i32 1, i32 1, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vqrdmlsdhxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqrdmlsdhxq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, i32 1, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqrdmlsdhxq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqrdmlsdhxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vqrdmlsdhxq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-neon-range-checks.c b/clang/test/CodeGen/arm-neon-range-checks.c new file mode 100644 index 00000000000000..488dad6d59acd5 --- /dev/null +++ b/clang/test/CodeGen/arm-neon-range-checks.c @@ -0,0 +1,424 @@ +// RUN: %clang_cc1 -triple arm64-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s +// RUN: %clang_cc1 -triple armv8.1a-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s + +#include <arm_neon.h> + +void test_vdot_lane(int32x2_t r, int8x8_t a, int8x8_t b) { + vdot_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdot_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdot_lane_s32(r, a, b, 0); + vdot_lane_s32(r, a, b, 1); +} + +void test_vdotq_lane(int32x4_t r, int8x16_t a, int8x8_t b) { + vdotq_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdotq_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdotq_lane_s32(r, a, b, 0); + vdotq_lane_s32(r, a, b, 1); +} + +#if defined(__aarch64__) +void test_vdot_laneq(int32x2_t r, int8x8_t a, int8x16_t b) { + vdot_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdot_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdot_laneq_s32(r, a, b, 0); + vdot_laneq_s32(r, a, b, 3); +} + +void test_vdotq_laneq(int32x4_t r, int8x16_t a, int8x16_t b) { + vdotq_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdotq_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdotq_laneq_s32(r, a, b, 0); + vdotq_laneq_s32(r, a, b, 3); +} +#endif + +void test_vdup_lane(int32x2_t v) { + vdup_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+ vdup_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdup_lane_s32(v, 0); + vdup_lane_s32(v, 1); +} + +void test_vdupq_lane(int32x2_t v) { + vdupq_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdupq_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdupq_lane_s32(v, 0); + vdupq_lane_s32(v, 1); +} + +#if defined(__aarch64__) +void test_vdup_laneq(int32x4_t v) { + vdup_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdup_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdup_laneq_s32(v, 0); + vdup_laneq_s32(v, 3); +} + +void test_vdupq_laneq(int32x4_t v) { + vdupq_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdupq_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdupq_laneq_s32(v, 0); + vdupq_laneq_s32(v, 3); +} +#endif + +void test_vmla_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vmla_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmla_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmla_lane_s32(a, b, v, 0); + vmla_lane_s32(a, b, v, 1); +} + +void test_vmlaq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vmlaq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlaq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlaq_lane_s32(a, b, v, 0); + vmlaq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmla_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vmla_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmla_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmla_laneq_s32(a, b, v, 0); + vmla_laneq_s32(a, b, v, 3); +} + +void test_vmlaq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vmlaq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlaq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlaq_laneq_s32(a, b, v, 0); + vmlaq_laneq_s32(a, b, v, 3); +} + +void test_vmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlal_high_lane_s32(a, b, v, 0); + vmlal_high_lane_s32(a, b, v, 1); +} + +void test_vmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlal_high_laneq_s32(a, b, v, 0); + vmlal_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlal_lane_s32(a, b, v, 0); + vmlal_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmlal_laneq(int64x2_t a, 
int32x2_t b, int32x4_t v) { + vmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlal_laneq_s32(a, b, v, 0); + vmlal_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmls_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vmls_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmls_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmls_lane_s32(a, b, v, 0); + vmls_lane_s32(a, b, v, 1); +} + +void test_vmlsq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vmlsq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsq_lane_s32(a, b, v, 0); + vmlsq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmls_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vmls_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmls_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmls_laneq_s32(a, b, v, 0); + vmls_laneq_s32(a, b, v, 3); +} + +void test_vmlsq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vmlsq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsq_laneq_s32(a, b, v, 0); + vmlsq_laneq_s32(a, b, v, 3); +} + +void test_vmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsl_high_lane_s32(a, b, v, 0); + vmlsl_high_lane_s32(a, b, v, 1); +} + +void test_vmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsl_high_laneq_s32(a, b, v, 0); + vmlsl_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsl_lane_s32(a, b, v, 0); + vmlsl_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsl_laneq_s32(a, b, v, 0); + vmlsl_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmull_lane(int32x2_t a, int32x2_t b) { + vmull_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmull_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmull_lane_s32(a, b, 0); + vmull_lane_s32(a, b, 1); +} + +#if defined(__aarch64__) +void test_vmull_laneq(int32x2_t a, int32x4_t b) { + vmull_laneq_s32(a, b, -1); // 
expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmull_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmull_laneq_s32(a, b, 0); + vmull_laneq_s32(a, b, 3); +} + +void test_vmull_high_lane(int32x4_t a, int32x2_t b) { + vmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmull_high_lane_s32(a, b, 0); + vmull_high_lane_s32(a, b, 1); +} + +void test_vmull_high_laneq(int32x4_t a, int32x4_t b) { + vmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmull_high_laneq_s32(a, b, 0); + vmull_high_laneq_s32(a, b, 3); +} + +void test_vqdmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vqdmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlal_high_lane_s32(a, b, v, 0); + vqdmlal_high_lane_s32(a, b, v, 1); +} + +void test_vqdmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vqdmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlal_high_laneq_s32(a, b, v, 0); + vqdmlal_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vqdmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlal_lane_s32(a, b, v, 0); + vqdmlal_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vqdmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlal_laneq_s32(a, b, v, 0); + vqdmlal_laneq_s32(a, b, v, 3); +} + +void test_vqdmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vqdmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlsl_high_lane_s32(a, b, v, 0); + vqdmlsl_high_lane_s32(a, b, v, 1); +} + +void test_vqdmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vqdmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlsl_high_laneq_s32(a, b, v, 0); + vqdmlsl_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vqdmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlsl_lane_s32(a, b, v, 0); + vqdmlsl_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmlsl_laneq(int64x2_t a, 
int32x2_t b, int32x4_t v) { + vqdmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlsl_laneq_s32(a, b, v, 0); + vqdmlsl_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmulh_lane(int32x2_t a, int32x2_t b) { + vqdmulh_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmulh_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmulh_lane_s32(a, b, 0); + vqdmulh_lane_s32(a, b, 1); +} + +void test_vqdmulhq_lane(int32x4_t a, int32x2_t b) { + vqdmulhq_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmulhq_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmulhq_lane_s32(a, b, 0); + vqdmulhq_lane_s32(a, b, 1); +} + +#if defined(__aarch64__) +void test_vqdmulh_laneq(int32x2_t a, int32x4_t b) { + vqdmulh_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmulh_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmulh_laneq_s32(a, b, 0); + vqdmulh_laneq_s32(a, b, 3); +} + +void test_vqdmulhq_laneq(int32x4_t a, int32x4_t b) { + vqdmulhq_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmulhq_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmulhq_laneq_s32(a, b, 0); + vqdmulhq_laneq_s32(a, b, 3); +} + +void test_vqdmull_high_lane(int32x4_t a, int32x2_t b) { + vqdmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmull_high_lane_s32(a, b, 0); + vqdmull_high_lane_s32(a, b, 1); +} + +void test_vqdmull_high_laneq(int32x4_t a, int32x4_t b) { + vqdmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmull_high_laneq_s32(a, b, 0); + vqdmull_high_laneq_s32(a, b, 3); +} +#endif + +void test_vqdmull_lane(int32x2_t a, int32x2_t v) { + vqdmull_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmull_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmull_lane_s32(a, v, 0); + vqdmull_lane_s32(a, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmull_laneq(int32x2_t a, int32x4_t v) { + vqdmull_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmull_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmull_laneq_s32(a, v, 0); + vqdmull_laneq_s32(a, v, 3); +} +#endif + +void test_vqrdmlah_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vqrdmlah_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlah_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlah_lane_s32(a, b, v, 0); + vqrdmlah_lane_s32(a, b, v, 1); +} + +void test_vqrdmlahq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vqrdmlahq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range 
[0, 1]}} + vqrdmlahq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlahq_lane_s32(a, b, v, 0); + vqrdmlahq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmlah_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vqrdmlah_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlah_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlah_laneq_s32(a, b, v, 0); + vqrdmlah_laneq_s32(a, b, v, 3); +} + +void test_vqrdmlahq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vqrdmlahq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlahq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlahq_laneq_s32(a, b, v, 0); + vqrdmlahq_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqrdmlsh_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vqrdmlsh_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlsh_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlsh_lane_s32(a, b, v, 0); + vqrdmlsh_lane_s32(a, b, v, 1); +} + +void test_vqrdmlshq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vqrdmlshq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlshq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlshq_lane_s32(a, b, v, 0); + vqrdmlshq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmlsh_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vqrdmlsh_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlsh_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlsh_laneq_s32(a, b, v, 0); + vqrdmlsh_laneq_s32(a, b, v, 3); +} + +void test_vqrdmlshq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vqrdmlshq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlshq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlshq_laneq_s32(a, b, v, 0); + vqrdmlshq_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqrdmulh_lane(int32x2_t a, int32x2_t v) { + vqrdmulh_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmulh_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmulh_lane_s32(a, v, 0); + vqrdmulh_lane_s32(a, v, 1); +} + +void test_vqrdmulhq_lane(int32x4_t a, int32x2_t v) { + vqrdmulhq_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmulhq_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmulhq_lane_s32(a, v, 0); + vqrdmulhq_lane_s32(a, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmulh_laneq(int32x2_t a, int32x4_t v) { + vqrdmulh_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmulh_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmulh_laneq_s32(a, v, 0); + vqrdmulh_laneq_s32(a, v, 3); +} + +void test_vqrdmulhq_laneq(int32x4_t a, int32x4_t v) { + vqrdmulhq_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the 
valid range [0, 3]}} + vqrdmulhq_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmulhq_laneq_s32(a, v, 0); + vqrdmulhq_laneq_s32(a, v, 3); +} +#endif diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c index 4b48ba01c4bcda..a0896c7aa4f229 100644 --- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c @@ -773,19 +773,23 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { } // CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_n_f16 @@ -939,17 +943,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) { } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } // CHECK-LABEL: @test_vext_f16( diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c index 24c57c4f0de3ac..5c6f61e7acf915 100644 --- a/clang/test/CodeGen/arm64_vdupq_n_f64.c +++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c @@ -28,7 +28,9 @@ float32x4_t test_vdupq_n_f32(float32_t w) { // this was in , but had already been implemented, // test anyway // CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8>
[[TMP0]] to <1 x double> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer // CHECK: ret <2 x double> [[SHUFFLE]] float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index 9f1a64554155cd..2cdbfb9ba26be6 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -2419,15 +2419,19 @@ uint8x8_t test_vdup_lane_u8(uint8x8_t a) { } // CHECK-LABEL: @test_vdup_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } // CHECK-LABEL: @test_vdup_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i32> [[LANE]] uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); } @@ -2440,15 +2444,19 @@ int8x8_t test_vdup_lane_s8(int8x8_t a) { } // CHECK-LABEL: @test_vdup_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } // CHECK-LABEL: @test_vdup_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i32> [[LANE]] int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); } @@ -2461,15 +2469,19 @@ poly8x8_t test_vdup_lane_p8(poly8x8_t a) { } // CHECK-LABEL: @test_vdup_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } // CHECK-LABEL: @test_vdup_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: ret <2 x float> [[LANE]] 
float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } @@ -2482,15 +2494,19 @@ uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[LANE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } @@ -2503,15 +2519,19 @@ int8x16_t test_vdupq_lane_s8(int8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[LANE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } @@ -2524,43 +2544,55 @@ poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: ret <4 x float> [[LANE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } // CHECK-LABEL: @test_vdup_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: 
[[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } // CHECK-LABEL: @test_vdup_lane_u64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_u64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } @@ -7077,44 +7109,52 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { } // CHECK-LABEL: @test_vmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> 
[[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlal_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); @@ -7173,90 +7213,110 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { } // CHECK-LABEL: @test_vmla_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// 
CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: 
[[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); @@ -7553,44 +7613,52 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { } // CHECK-LABEL: @test_vmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 
x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsl_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { 
return vmlsl_lane_u32(a, b, c, 1); @@ -7649,90 +7717,110 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { } // CHECK-LABEL: @test_vmls_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> 
[[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlsq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> 
[[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); @@ -8404,40 +8492,48 @@ poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { } // CHECK-LABEL: @test_vmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmull_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: 
[[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); @@ -8506,80 +8602,100 @@ poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { } // CHECK-LABEL: @test_vmul_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 
x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); @@ -9700,24 +9816,28 @@ int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { } // CHECK-LABEL: @test_vqdmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to 
<4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); @@ -9774,24 +9894,28 @@ int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { } // CHECK-LABEL: @test_vqdmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> 
%c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_lane_s32(a, b, c, 1); @@ -9866,10 +9990,12 @@ int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { } // CHECK-LABEL: @test_vqdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQDMULH_V2_I]] int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -9877,10 +10003,12 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmulh_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQDMULH_V2_I]] int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -9888,10 +10016,12 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqdmulhq_lane_s16( -// 
CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -9899,10 +10029,12 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) { @@ -9988,10 +10120,12 @@ int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqdmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { @@ -9999,10 +10133,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: 
[[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) { @@ -10204,10 +10340,12 @@ int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { } // CHECK-LABEL: @test_vqrdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -10215,10 +10353,12 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqrdmulh_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -10226,10 +10366,12 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqrdmulhq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] 
to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -10237,10 +10379,12 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqrdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { diff --git a/clang/test/CodeGen/bounds-checking.c b/clang/test/CodeGen/bounds-checking.c index 2e6a08650dd974..15cef8c007a55d 100644 --- a/clang/test/CodeGen/bounds-checking.c +++ b/clang/test/CodeGen/bounds-checking.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fsanitize=local-bounds -emit-llvm -triple x86_64-apple-darwin10 %s -o - | FileCheck %s // RUN: %clang_cc1 -fsanitize=local-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 %s -o - | FileCheck %s -// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s -// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s +// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s --check-prefixes=CHECK,NONLOCAL +// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s --check-prefixes=CHECK,NONLOCAL // // REQUIRES: x86-registered-target @@ -31,3 +31,21 @@ void f3() { // CHECK: call {{.*}} @llvm.trap a[2] = 1; } + +union U { int a[0]; int b[1]; int c[2]; }; + +// CHECK-LABEL: define {{.*}} @f4 +int f4(union U *u, int i) { + // a and b are treated as flexible array members. + // CHECK-NOT: @llvm.trap + return u->a[i] + u->b[i]; + // CHECK: } +} + +// CHECK-LABEL: define {{.*}} @f5 +int f5(union U *u, int i) { + // c is not a flexible array member. 
+ // NONLOCAL: call {{.*}} @llvm.trap + return u->c[i]; + // CHECK: } +} diff --git a/clang/test/CodeGen/debug-info-extern-call.c b/clang/test/CodeGen/debug-info-extern-call.c index da3764f7359eab..072e578b58986c 100644 --- a/clang/test/CodeGen/debug-info-extern-call.c +++ b/clang/test/CodeGen/debug-info-extern-call.c @@ -1,7 +1,7 @@ // When entry values are emitted, expect a subprogram for extern decls so that // the dwarf generator can describe call site parameters at extern call sites. // -// RUN: %clang -Xclang -femit-debug-entry-values -g -O2 -target x86_64-none-linux-gnu -S -emit-llvm %s -o - \ +// RUN: %clang -g -O2 -target x86_64-none-linux-gnu -S -emit-llvm %s -o - \ // RUN: | FileCheck %s -check-prefix=DECLS-FOR-EXTERN // Similarly, when the debugger tuning is gdb, expect a subprogram for extern diff --git a/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp b/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp index 667c2469b55eac..e64e07cdb7485c 100644 --- a/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp +++ b/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp @@ -15,22 +15,27 @@ // RUN: | FileCheck %s -check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DISubprogram -implicit-check-not=DIFlagAllCallsDescribed -// Supported: DWARF4 + GDB tuning by using '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Note: DIFlagAllCallsDescribed may have been enabled prematurely when tuning +// for GDB under -gdwarf-4 in https://reviews.llvm.org/D69743. It's possible +// this should have been 'Unsupported' until entry values emission was enabled +// by default. +// +// Supported: DWARF4 + GDB tuning +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O1 -disable-llvm-passes -debugger-tuning=gdb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DIFlagAllCallsDescribed -// Supported: DWARF4 + LLDB tuning by using '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Supported: DWARF4 + LLDB, -O1 +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O1 -disable-llvm-passes -debugger-tuning=lldb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DIFlagAllCallsDescribed -// Unsupported: -O0 + '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Unsupported: -O0 +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O0 -disable-llvm-passes -debugger-tuning=gdb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=NO-ATTR diff --git a/clang/test/CodeGenCXX/used-decl-visitor.cpp b/clang/test/CodeGenCXX/used-decl-visitor.cpp new file mode 100644 index 00000000000000..2b923ab562dbda --- /dev/null +++ b/clang/test/CodeGenCXX/used-decl-visitor.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple x86_64 -emit-llvm -o %t %s + +// Make sure there is no assertion due to UsedDeclVisitor. 
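+// (Constructing `B()` below forces the default argument `a.a` to be visited
+// as a used declaration, which is presumably the path that used to assert.)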
+ +struct A { + int a; +}; + +static A a; + +struct B { + B(int b = a.a) {} +}; + + +void foo() { + B(); +} diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip index b2ad0424b30610..59afa3fdb2d7b0 100644 --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -8,3 +8,8 @@ // // CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device" // CHECK-SAME: "--gpu-max-threads-per-block=1024" + +// RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \ +// RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MLLVM %s +// MLLVM-NOT: "-mllvm"{{.*}}"-amdgpu-early-inline-all=true"{{.*}}"-mllvm"{{.*}}"-amdgpu-early-inline-all=true" diff --git a/clang/test/OpenMP/target_ast_print.cpp b/clang/test/OpenMP/target_ast_print.cpp index a36b44bd406efb..0352ae63ff81b7 100644 --- a/clang/test/OpenMP/target_ast_print.cpp +++ b/clang/test/OpenMP/target_ast_print.cpp @@ -353,11 +353,11 @@ int a; template T tmain(T argc, T *argv) { T i, j, a[20], always, close; -#pragma omp target +#pragma omp target device(argc) foo(); -#pragma omp target if (target:argc > 0) +#pragma omp target if (target:argc > 0) device(device_num: C) foo(); -#pragma omp target if (C) +#pragma omp target if (C) device(ancestor: argc) foo(); #pragma omp target map(i) foo(); @@ -458,11 +458,11 @@ T tmain(T argc, T *argv) { // OMP5: template T tmain(T argc, T *argv) { // OMP5-NEXT: T i, j, a[20] -// OMP5-NEXT: #pragma omp target{{$}} +// OMP5-NEXT: #pragma omp target device(argc){{$}} // OMP5-NEXT: foo(); -// OMP5-NEXT: #pragma omp target if(target: argc > 0) +// OMP5-NEXT: #pragma omp target if(target: argc > 0) device(device_num: C) // OMP5-NEXT: foo() -// OMP5-NEXT: #pragma omp target if(C) +// OMP5-NEXT: #pragma omp target if(C) device(ancestor: argc) // OMP5-NEXT: foo() // OMP5-NEXT: #pragma omp target map(tofrom: i) // OMP5-NEXT: foo() @@ -650,11 +650,11 @@ T tmain(T argc, T *argv) { // OMP5-NEXT: foo() // OMP5: template<> char tmain(char argc, char *argv) { // OMP5-NEXT: char i, j, a[20] -// OMP5-NEXT: #pragma omp target +// OMP5-NEXT: #pragma omp target device(argc) // OMP5-NEXT: foo(); -// OMP5-NEXT: #pragma omp target if(target: argc > 0) +// OMP5-NEXT: #pragma omp target if(target: argc > 0) device(device_num: 1) // OMP5-NEXT: foo() -// OMP5-NEXT: #pragma omp target if(1) +// OMP5-NEXT: #pragma omp target if(1) device(ancestor: argc) // OMP5-NEXT: foo() // OMP5-NEXT: #pragma omp target map(tofrom: i) // OMP5-NEXT: foo() diff --git a/clang/test/OpenMP/target_data_device_messages.cpp b/clang/test/OpenMP/target_data_device_messages.cpp index 873402d372ddf5..7610e8413158cc 100644 --- a/clang/test/OpenMP/target_data_device_messages.cpp +++ b/clang/test/OpenMP/target_data_device_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized void foo() { } @@ -24,6 +24,7 @@ int main(int argc, char **argv) { #pragma omp target data map(to: a) device (S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target data map(to: a) device (-2) // expected-error {{argument to 
'device' clause must be a non-negative integer value}} #pragma omp target data map(to: a) device (-10u) + #pragma omp target data map(to: a) device (ancestor: -10u) // expected-error {{use of undeclared identifier 'ancestor'}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp target data map(to: a) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}} foo(); diff --git a/clang/test/OpenMP/target_device_codegen.cpp b/clang/test/OpenMP/target_device_codegen.cpp new file mode 100644 index 00000000000000..8117540d39396f --- /dev/null +++ b/clang/test/OpenMP/target_device_codegen.cpp @@ -0,0 +1,50 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +void foo(int n) { + + // CHECK: [[N:%.+]] = load i32, i32* [[N_ADDR:%.+]], + // CHECK: store i32 [[N]], i32* [[DEVICE_CAP:%.+]], + // CHECK: [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]], + // CHECK: [[DEVICE:%.+]] = sext i32 [[DEV]] to i64 + // CHECK: [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i64* null, i64* null) + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target device(n) + ; + // CHECK: [[N:%.+]] = load i32, i32* [[N_ADDR]], + // CHECK: store i32 [[N]], i32* [[DEVICE_CAP:%.+]], + // CHECK: [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]], + // CHECK: [[DEVICE:%.+]] = sext i32 [[DEV]] to i64 + // CHECK: [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i64* null, i64* null) + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target device(device_num: n) + ; + // CHECK-NOT: call i32 @__tgt_target( + // CHECK: call void @__omp_offloading_{{.+}}_l46() + // CHECK-NOT: call i32 @__tgt_target( + #pragma omp target device(ancestor: n) + ; +} + +#endif diff --git a/clang/test/OpenMP/target_device_messages.cpp 
b/clang/test/OpenMP/target_device_messages.cpp index 8a583e9f3bd74f..1ca4a88ba8c536 100644 --- a/clang/test/OpenMP/target_device_messages.cpp +++ b/clang/test/OpenMP/target_device_messages.cpp @@ -1,6 +1,8 @@ -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp -fopenmp-version=45 -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp50 -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp45 -fopenmp-simd -fopenmp-version=45 -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify=expected,omp50 -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized void foo() { } @@ -21,11 +23,13 @@ int main(int argc, char **argv) { foo(); #pragma omp target device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}} foo(); + #pragma omp target device (argc: // expected-error {{expected ')'}} expected-note {{to match this '('}} omp50-error {{expected expression}} + foo(); #pragma omp target device (z-argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}} foo(); - #pragma omp target device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} + #pragma omp target device (device_num : argc > 0 ? argv[1] : argv[2]) // omp45-error {{use of undeclared identifier 'device_num'}} omp45-error {{expected ')'}} omp45-note {{to match this '('}} omp50-error {{expression must have integral or unscoped enumeration type, not 'char *'}} foo(); - #pragma omp target device (argc + argc) + #pragma omp target device (argc: argc + argc) // omp45-error {{expected ')'}} omp45-note {{to match this '('}} omp50-error {{expected 'ancestor' or 'device_num' in OpenMP clause 'device'}} foo(); #pragma omp target device (argc), device (argc+1) // expected-error {{directive '#pragma omp target' cannot contain more than one 'device' clause}} foo(); @@ -37,6 +41,8 @@ int main(int argc, char **argv) { foo(); #pragma omp target device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}} foo(); + #pragma omp target device (ancestor) // expected-error {{use of undeclared identifier 'ancestor'}} + foo(); return 0; } diff --git a/clang/test/OpenMP/target_enter_data_device_messages.cpp b/clang/test/OpenMP/target_enter_data_device_messages.cpp index a233a47152dc0d..7f5356b2fc5f46 100644 --- a/clang/test/OpenMP/target_enter_data_device_messages.cpp +++ b/clang/test/OpenMP/target_enter_data_device_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized void foo() { } @@ -24,6 +24,7 @@ int main(int argc, char **argv) { #pragma omp target 
enter data map(to: i) device (S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target enter data map(to: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}} #pragma omp target enter data map(to: i) device (-10u) + #pragma omp target enter data map(to: i) device (device_num: -10u) // expected-error {{use of undeclared identifier 'device_num'}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp target enter data map(to: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}} foo(); diff --git a/clang/test/OpenMP/target_exit_data_device_messages.cpp b/clang/test/OpenMP/target_exit_data_device_messages.cpp index 035bf6f76a615b..f896882f6f1b0b 100644 --- a/clang/test/OpenMP/target_exit_data_device_messages.cpp +++ b/clang/test/OpenMP/target_exit_data_device_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized -// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -ferror-limit 100 -o - %s -Wuninitialized +// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 -o - %s -Wuninitialized void foo() { } @@ -24,6 +24,7 @@ int main(int argc, char **argv) { #pragma omp target exit data map(from: i) device (S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target exit data map(from: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}} #pragma omp target exit data map(from: i) device (-10u + z) + #pragma omp target exit data map(from: i) device (ancestor: -10u + z) // expected-error {{use of undeclared identifier 'ancestor'}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp target exit data map(from: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}} foo(); diff --git a/clang/test/OpenMP/target_update_device_messages.cpp b/clang/test/OpenMP/target_update_device_messages.cpp index 8b4aefaf580d8b..fa1c356f9c40c2 100644 --- a/clang/test/OpenMP/target_update_device_messages.cpp +++ b/clang/test/OpenMP/target_update_device_messages.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized -// RUN: %clang_cc1 -verify -fopenmp-simd -ferror-limit 100 %s -Wuninitialized +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s -Wuninitialized void foo() { } @@ -21,6 +21,7 @@ int tmain(T argc, S **argv) { #pragma omp target update to(i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} #pragma omp target update from(i) device (argc > 0 ? 
argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} #pragma omp target update from(i) device (argc + z) +#pragma omp target update from(i) device (device_num: argc + z) // expected-error {{use of undeclared identifier 'device_num'}} expected-error {{expected ')'}} expected-note {{to match this '('}} #pragma omp target update from(i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'device' clause}} #pragma omp target update from(i) device (S1) // expected-error {{'S1' does not refer to a value}} #pragma omp target update from(i) device (3.14) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'double'}} diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c index 5be8b96e290dc4..54013ceffa6453 100644 --- a/clang/test/Preprocessor/hexagon-predefines.c +++ b/clang/test/Preprocessor/hexagon-predefines.c @@ -101,3 +101,15 @@ // RUN: -target-feature +hvxv67 -target-feature +hvx-length128b %s | FileCheck \ // RUN: %s -check-prefix CHECK-ELF // CHECK-ELF: #define __ELF__ 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-linux-musl \ +// RUN: -target-cpu hexagonv67 -target-feature +hvxv67 \ +// RUN: -target-feature +hvx-length128b %s | FileCheck \ +// RUN: %s -check-prefix CHECK-LINUX +// CHECK-LINUX: #define __gnu_linux__ 1 +// CHECK-LINUX: #define __linux 1 +// CHECK-LINUX: #define __linux__ 1 +// CHECK-LINUX: #define __unix 1 +// CHECK-LINUX: #define __unix__ 1 +// CHECK-LINUX: #define linux 1 +// CHECK-LINUX: #define unix 1 diff --git a/clang/test/SemaTemplate/dependent-typos-recovery.cpp b/clang/test/SemaTemplate/dependent-typos-recovery.cpp new file mode 100644 index 00000000000000..d05b7144d908b3 --- /dev/null +++ b/clang/test/SemaTemplate/dependent-typos-recovery.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// There should be no extra errors about missing 'template' keywords. 
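+// (That is, once 'bilder' is corrected to 'builder', the recovered member
+// calls below must rebuild as `builder.f<int>()` without demanding an
+// explicit 'template' disambiguator.)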
+struct B {
+  template <typename T>
+  int f(){};
+} builder; // expected-note 2{{'builder' declared here}}
+
+auto a = bilder.f<int>(); // expected-error{{undeclared identifier 'bilder'; did you mean}}
+auto b = (*(&bilder+0)).f<int>(); // expected-error{{undeclared identifier 'bilder'; did you mean}}
diff --git a/clang/unittests/CodeGen/CodeGenExternalTest.cpp b/clang/unittests/CodeGen/CodeGenExternalTest.cpp
index 8dff45c8a0f53a..255b8c3e9d8cdc 100644
--- a/clang/unittests/CodeGen/CodeGenExternalTest.cpp
+++ b/clang/unittests/CodeGen/CodeGenExternalTest.cpp
@@ -199,7 +199,7 @@ static void test_codegen_fns(MyASTConsumer *my) {
     dbgs() << "\n";
   }
 
-  llvm::CompositeType* structTy = dyn_cast<llvm::CompositeType>(llvmTy);
+  auto* structTy = dyn_cast<llvm::StructType>(llvmTy);
   ASSERT_TRUE(structTy != NULL);
 
   // Check getLLVMFieldNumber
diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp
index 03ebe337e76c80..9746f6e15322be 100644
--- a/clang/unittests/Format/FormatTestCSharp.cpp
+++ b/clang/unittests/Format/FormatTestCSharp.cpp
@@ -628,7 +628,6 @@ TEST_F(FormatTestCSharp, CSharpSpaces) {
   verifyFormat(R"(catch (TestException) when (innerFinallyExecuted))", Style);
   verifyFormat(R"(private float[,] Values;)", Style);
   verifyFormat(R"(Result this[Index x] => Foo(x);)", Style);
-  verifyFormat(R"(class ItemFactory<T> where T : new() {})", Style);
 
   Style.SpacesInSquareBrackets = true;
   verifyFormat(R"(private float[ , ] Values;)", Style);
@@ -673,5 +672,22 @@ if (someThings[i][j][k].Contains(myThing)) {
                Style);
 }
 
+TEST_F(FormatTestCSharp, CSharpGenericTypeConstraints) {
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp);
+
+  verifyFormat(R"(//
+class ItemFactory<T>
+    where T : new() {})", Style);
+
+  verifyFormat(R"(//
+class Dictionary<TKey, TVal>
+    where TKey : IComparable<TKey>
+    where TVal : IMyInterface {
+  public void MyMethod<T>(T t)
+      where T : IMyInterface { doThing(); }
+})",
+               Style);
+}
+
 } // namespace format
 } // end namespace clang
diff --git a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
index d0cf291eb2b8e8..8027d3338b6994 100644
--- a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
+++ b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp
@@ -81,6 +81,66 @@ TEST(RegisterCustomCheckers, CheckLocationIncDec) {
       runCheckerOnCode<addLocIncDecChecker>("void f() { int *p; (*p)++; }"));
 }
 
+//===----------------------------------------------------------------------===//
+// Unsatisfied checker dependency
+//===----------------------------------------------------------------------===//
+
+class PrerequisiteChecker : public Checker<check::ASTCodeBody> {
+public:
+  void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
+                        BugReporter &BR) const {
+    BR.EmitBasicReport(D, this, "Prerequisite", categories::LogicError,
+                       "This is the prerequisite checker",
+                       PathDiagnosticLocation(D, Mgr.getSourceManager()), {});
+  }
+};
+
+void registerPrerequisiteChecker(CheckerManager &mgr) {
+  mgr.registerChecker<PrerequisiteChecker>();
+}
+
+bool shouldRegisterPrerequisiteChecker(const LangOptions &LO) {
+  return false;
+}
+
+class DependentChecker : public Checker<check::ASTCodeBody> {
+public:
+  void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
+                        BugReporter &BR) const {
+    BR.EmitBasicReport(D, this, "Dependent", categories::LogicError,
+                       "This is the Dependent Checker",
+                       PathDiagnosticLocation(D, Mgr.getSourceManager()), {});
+  }
+};
+
+void registerDependentChecker(CheckerManager &mgr) {
+  mgr.registerChecker<DependentChecker>();
+}
+
+bool shouldRegisterDependentChecker(const LangOptions &LO) {
+  return true;
+}
+
+void addDependentChecker(AnalysisASTConsumer &AnalysisConsumer,
+                         AnalyzerOptions &AnOpts) {
+  AnOpts.CheckersAndPackages = {{"custom.Dependent", true}};
+  AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) {
+    Registry.addChecker(registerPrerequisiteChecker,
+                        shouldRegisterPrerequisiteChecker,
+                        "custom.Prerequisite", "Description", "", false);
+    Registry.addChecker(registerDependentChecker,
+                        shouldRegisterDependentChecker,
+                        "custom.Dependent", "Description", "", false);
+    Registry.addDependency("custom.Dependent", "custom.Prerequisite");
+  });
+}
+
+TEST(RegisterDependentCheckers, RegisterChecker) {
+  std::string Diags;
+  EXPECT_TRUE(runCheckerOnCode<addDependentChecker>("void f() {;}", Diags));
+  EXPECT_EQ(Diags, "");
+}
+
 } // namespace
 } // namespace ento
 } // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp
index 3bb3a88e43670e..cfed1eefbfe3b8 100644
--- a/clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -99,6 +99,7 @@ class SyntaxTreeTest : public ::testing::Test {
     Diags->setClient(new IgnoringDiagConsumer);
     // Prepare to run a compiler.
    std::vector<const char *> Args = {"syntax-test", "-std=c++11",
+                                      "-fno-delayed-template-parsing",
                                       "-fsyntax-only", FileName};
     Invocation = createInvocationFromCommandLine(Args, Diags, FS);
     assert(Invocation);
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 59ea15493f037d..9166e7a718ec52 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -27,8 +27,9 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
@@ -238,6 +239,11 @@ class Type {
     NumVectors = 1;
   }
 
+  void make32BitElement() {
+    assert_with_loc(Bitwidth > 32, "Not enough bits to make it 32!");
+    ElementBitwidth = 32;
+  }
+
   void doubleLanes() {
     assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!");
     Bitwidth = 128;
@@ -518,7 +524,8 @@ class Intrinsic {
   std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
   std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
   std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
-  std::pair<Type, std::string> emitDagCall(DagInit *DI);
+  std::pair<Type, std::string> emitDagCall(DagInit *DI,
+                                           bool MatchMangledName);
   std::pair<Type, std::string> emitDagNameReplace(DagInit *DI);
   std::pair<Type, std::string> emitDagLiteral(DagInit *DI);
   std::pair<Type, std::string> emitDagOp(DagInit *DI);
@@ -546,7 +553,8 @@ class NeonEmitter {
 public:
   /// Called by Intrinsic - this attempts to get an intrinsic that takes
   /// the given types as arguments.
-  Intrinsic &getIntrinsic(StringRef Name, ArrayRef<Type> Types);
+  Intrinsic &getIntrinsic(StringRef Name, ArrayRef<Type> Types,
+                          Optional<std::string> MangledName);
 
   /// Called by Intrinsic - returns a globally-unique number.
   unsigned getUniqueNumber() { return UniqueNumber++; }
@@ -1383,8 +1391,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(DagInit *DI) {
     return emitDagSaveTemp(DI);
   if (Op == "op")
     return emitDagOp(DI);
-  if (Op == "call")
-    return emitDagCall(DI);
+  if (Op == "call" || Op == "call_mangled")
+    return emitDagCall(DI, Op == "call_mangled");
   if (Op == "name_replace")
    return emitDagNameReplace(DI);
   if (Op == "literal")
@@ -1411,7 +1419,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagOp(DagInit *DI) {
   }
 }
 
-std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCall(DagInit *DI) {
+std::pair<Type, std::string>
+Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) {
   std::vector<Type> Types;
   std::vector<std::string> Values;
   for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) {
@@ -1427,7 +1436,13 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCall(DagInit *DI) {
     N = SI->getAsUnquotedString();
   else
     N = emitDagArg(DI->getArg(0), "").second;
-  Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types);
+  Optional<std::string> MangledName;
+  if (MatchMangledName) {
+    if (Intr.getRecord()->getValueAsBit("isLaneQ"))
+      N += "q";
+    MangledName = Intr.mangleName(N, ClassS);
+  }
+  Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types, MangledName);
 
   // Make sure the callee is known as an early def.
   Callee.setNeededEarly();
@@ -1486,6 +1501,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
       castToType.doubleLanes();
     } else if (SI->getAsUnquotedString() == "8") {
       castToType.makeInteger(8, true);
+    } else if (SI->getAsUnquotedString() == "32") {
+      castToType.make32BitElement();
     } else {
       castToType = Type::fromTypedefName(SI->getAsUnquotedString());
       assert_with_loc(!castToType.isVoid(), "Unknown typedef");
@@ -1832,7 +1849,8 @@ void Intrinsic::indexBody() {
 
 // NeonEmitter implementation
 //===----------------------------------------------------------------------===//
 
-Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types) {
+Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types,
+                                     Optional<std::string> MangledName) {
   // First, look up the name in the intrinsic map.
   assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(),
                   ("Intrinsic '" + Name + "' not found!").str());
@@ -1861,17 +1879,19 @@
     }
     ErrMsg += ")\n";
 
+    if (MangledName && MangledName != I.getMangledName(true))
+      continue;
+
     if (I.getNumParams() != Types.size())
       continue;
 
-    bool Good = true;
-    for (unsigned Arg = 0; Arg < Types.size(); ++Arg) {
-      if (I.getParamType(Arg) != Types[Arg]) {
-        Good = false;
-        break;
-      }
-    }
-    if (Good)
+    unsigned ArgNum = 0;
+    bool MatchingArgumentTypes =
+        std::all_of(Types.begin(), Types.end(), [&](const auto &Type) {
+          return Type == I.getParamType(ArgNum++);
+        });
+
+    if (MatchingArgumentTypes)
       GoodVec.push_back(&I);
   }
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 8b53e376cb0d7d..e02e94dd98ae62 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -100,6 +100,10 @@ class SVEType {
   /// string for passing to the BUILTIN() macro in Builtins.def.
   std::string builtin_str() const;
 
+  /// Return the C/C++ string representation of a type for use in the
+  /// arm_sve.h header file.
+  std::string str() const;
+
 private:
   /// Creates the type based on the typespec string in TS.
   void applyTypespec();
@@ -335,6 +339,45 @@ std::string SVEType::builtin_str() const {
   return "q" + utostr(getNumElements() * NumVectors) + S;
 }
 
+std::string SVEType::str() const {
+  if (isPredicatePattern())
+    return "sv_pattern";
+
+  if (isPrefetchOp())
+    return "sv_prfop";
+
+  std::string S;
+  if (Void)
+    S += "void";
+  else {
+    if (isScalableVector())
+      S += "sv";
+    if (!Signed && !Float)
+      S += "u";
+
+    if (Float)
+      S += "float";
+    else if (isScalarPredicate())
+      S += "bool";
+    else
+      S += "int";
+
+    if (!isScalarPredicate())
+      S += utostr(ElementBitwidth);
+    if (!isScalableVector() && isVector())
+      S += "x" + utostr(getNumElements());
+    if (NumVectors > 1)
+      S += "x" + utostr(NumVectors);
+    S += "_t";
+  }
+
+  if (Constant)
+    S += " const";
+  if (Pointer)
+    S += " *";
+
+  return S;
+}
+
 void SVEType::applyTypespec() {
   for (char I : TS) {
     switch (I) {
@@ -515,8 +558,19 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS) const {
        << "(...) __builtin_sve_" << mangleName(ClassS)
        << "(__VA_ARGS__)\n";
   } else {
-    llvm_unreachable("Not yet implemented. Overloaded intrinsics will follow "
-                     "in a future patch");
+    std::string FullName = mangleName(ClassS);
+    std::string ProtoName = mangleName(ClassG);
+
+    OS << "__aio __attribute__((__clang_arm_builtin_alias("
+       << "__builtin_sve_" << FullName << ")))\n";
+
+    OS << getTypes()[0].str() << " " << ProtoName << "(";
+    for (unsigned I = 0; I < getTypes().size() - 1; ++I) {
+      if (I != 0)
+        OS << ", ";
+      OS << getTypes()[I + 1].str();
+    }
+    OS << ");\n";
   }
 }
 
@@ -559,6 +613,11 @@ void SVEEmitter::createIntrinsic(
 
     Out.push_back(std::make_unique<Intrinsic>(Name, Proto, Merge, LLVMName,
                                               Flags, TS, ClassS, *this, Guard));
+
+    // Also generate the short-form (e.g. svadd_m) for the given type-spec.
+    if (Intrinsic::isOverloadedIntrinsic(Name))
+      Out.push_back(std::make_unique<Intrinsic>(
+          Name, Proto, Merge, LLVMName, Flags, TS, ClassG, *this, Guard));
   }
 }
 
@@ -608,6 +667,10 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
   OS << "typedef __SVFloat64_t svfloat64_t;\n";
   OS << "typedef __SVBool_t svbool_t;\n\n";
 
+  OS << "/* Function attributes */\n";
+  OS << "#define __aio static inline __attribute__((__always_inline__, "
+        "__nodebug__, __overloadable__))\n\n";
+
   SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
   for (auto *R : RV)
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
index ae65dd3fd99519..fdda7013fe5c86 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
@@ -41,6 +41,10 @@
 #include
 #include
+#if __has_feature(ptrauth_calls)
+#include <ptrauth.h>
+#endif
+
 namespace __tsan {
 
 #if !SANITIZER_GO
@@ -274,6 +278,10 @@ void InitializePlatform() {
 uptr ExtractLongJmpSp(uptr *env) {
   uptr mangled_sp = env[LONG_JMP_SP_ENV_SLOT];
   uptr sp = mangled_sp ^ longjmp_xor_key;
+#if __has_feature(ptrauth_calls)
+  sp = (uptr)ptrauth_auth_data((void *)sp, ptrauth_key_asdb,
+                               ptrauth_string_discriminator("sp"));
+#endif
   return sp;
 }
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index fd750a06e0e14a..4ee4d7dca57334 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -21,6 +21,7 @@ set(LIBC_TARGET_MACHINE ${CMAKE_SYSTEM_PROCESSOR})
 
 include(CMakeParseArguments)
 include(LLVMLibCRules)
+include(LLVMLibCCheckCpuFeatures)
 
 add_subdirectory(src)
 add_subdirectory(config)
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
index adf81f3e38ab90..0bb4af869487b3 100644
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -1,99 +1,129 @@
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # Cpu features definition and flags
-#
-# Declare a list of all supported cpu features in ALL_CPU_FEATURES.
-#
-# Declares associated flags to enable/disable individual feature of the form:
-# - CPU_FEATURE_<name>_ENABLE_FLAG
-# - CPU_FEATURE_<name>_DISABLE_FLAG
-#
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 
 if(${LIBC_TARGET_MACHINE} MATCHES "x86|x86_64")
-  set(ALL_CPU_FEATURES SSE SSE2 AVX AVX512F)
+  set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F)
 endif()
 
-function(_define_cpu_feature_flags feature)
-  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-    string(TOLOWER ${feature} lowercase_feature)
-    set(CPU_FEATURE_${feature}_ENABLE_FLAG "-m${lowercase_feature}" PARENT_SCOPE)
-    set(CPU_FEATURE_${feature}_DISABLE_FLAG "-mno-${lowercase_feature}" PARENT_SCOPE)
+list(SORT ALL_CPU_FEATURES)
+
+# Function to check whether the host supports the provided set of features.
+# Usage:
+# host_supports(
+#   <output variable>
+#   <list of cpu features>
+# )
+function(host_supports output_var features)
+  _intersection(a "${HOST_CPU_FEATURES}" "${features}")
+  if("${a}" STREQUAL "${features}")
+    set(${output_var} TRUE PARENT_SCOPE)
+  else()
+    unset(${output_var} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Function to compute the flags to pass down to the compiler.
+# Usage:
+# compute_flags(
+#   <output variable>
+#   MARCH <cpu architecture>
+#   REQUIRE <list of required features>
+#   REJECT <list of rejected features>
+# )
+function(compute_flags output_var)
+  cmake_parse_arguments(
+    "COMPUTE_FLAGS"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  # Check that features are not required and rejected at the same time.
+  if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT)
+    _intersection(var ${COMPUTE_FLAGS_REQUIRE} ${COMPUTE_FLAGS_REJECT})
+    if(var)
+      message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}")
+    endif()
+  endif()
+  # Generate the compiler flags in `current`.
+  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU")
+    if(COMPUTE_FLAGS_MARCH)
+      list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}")
+    endif()
+    foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE)
+      string(TOLOWER ${feature} lowercase_feature)
+      list(APPEND current "-m${lowercase_feature}")
+    endforeach()
+    foreach(feature IN LISTS COMPUTE_FLAGS_REJECT)
+      string(TOLOWER ${feature} lowercase_feature)
+      list(APPEND current "-mno-${lowercase_feature}")
+    endforeach()
   else()
     # In future, we can extend for other compilers.
     message(FATAL_ERROR "Unknown compiler ${CMAKE_CXX_COMPILER_ID}.")
   endif()
+  # Export the list of flags.
+  set(${output_var} "${current}" PARENT_SCOPE)
 endfunction()
 
-# Defines cpu features flags
-foreach(feature IN LISTS ALL_CPU_FEATURES)
-  _define_cpu_feature_flags(${feature})
-endforeach()
-
-#------------------------------------------------------------------------------
-# Optimization level flags
-#
-# Generates the set of flags needed to compile for up to a particular
-# optimization level.
-#
-# Creates variables of the form `CPU_FEATURE_OPT_<feature>_FLAGS`.
-# CPU_FEATURE_OPT_NONE_FLAGS is a special flag for which no feature is needed.
-#
-# e.g.
-# CPU_FEATURE_OPT_NONE_FLAGS    : -mno-sse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE_FLAGS     : -msse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE2_FLAGS    : -msse;-msse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX_FLAGS     : -msse;-msse2;-mavx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX512F_FLAGS : -msse;-msse2;-mavx;-mavx512f
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# Internal helpers and utilities.
+# ------------------------------------------------------------------------------
 
-# Helper function to concatenate flags needed to support optimization up to
-# a particular feature.
-function(_generate_flags_for_up_to feature flag_variable)
-  list(FIND ALL_CPU_FEATURES ${feature} feature_index)
-  foreach(current_feature IN LISTS ALL_CPU_FEATURES)
-    list(FIND ALL_CPU_FEATURES ${current_feature} current_feature_index)
-    if(${current_feature_index} GREATER ${feature_index})
-      list(APPEND flags ${CPU_FEATURE_${current_feature}_DISABLE_FLAG})
-    else()
-      list(APPEND flags ${CPU_FEATURE_${current_feature}_ENABLE_FLAG})
+# Computes the intersection between two lists.
+function(_intersection output_var list1 list2)
+  foreach(element IN LISTS list1)
+    if("${list2}" MATCHES "(^|;)${element}(;|$)")
+      list(APPEND tmp "${element}")
     endif()
   endforeach()
-  set(${flag_variable} ${flags} PARENT_SCOPE)
+  set(${output_var} ${tmp} PARENT_SCOPE)
 endfunction()
 
-function(_generate_opt_levels)
-  set(opt_levels NONE)
-  list(APPEND opt_levels ${ALL_CPU_FEATURES})
-  foreach(feature IN LISTS opt_levels)
-    set(flag_name "CPU_FEATURE_OPT_${feature}_FLAGS")
-    _generate_flags_for_up_to(${feature} ${flag_name})
-    set(${flag_name} ${${flag_name}} PARENT_SCOPE)
+# Generates a cpp file to introspect the compiler-defined flags.
+function(_generate_check_code)
+  foreach(feature IN LISTS ALL_CPU_FEATURES)
+    set(DEFINITIONS
+        "${DEFINITIONS}
+#ifdef __${feature}__
+    \"${feature}\",
+#endif")
   endforeach()
+  configure_file(
+    "${LIBC_SOURCE_DIR}/cmake/modules/cpu_features/check_cpu_features.cpp.in"
+    "cpu_features/check_cpu_features.cpp" @ONLY)
 endfunction()
+_generate_check_code()
 
-_generate_opt_levels()
-
-#------------------------------------------------------------------------------
-# Host cpu feature introspection
-#
-# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE.
-#------------------------------------------------------------------------------
-function(_check_host_cpu_feature feature)
-  string(TOLOWER ${feature} lowercase_feature)
+# Compiles and runs the code generated above with the specified requirements.
+# This is helpful to infer which features a particular target supports or if
+# a specific feature implies other features (e.g. BMI2 implies SSE2 and SSE).
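+# For example, a hypothetical probe of an explicit micro-architecture could
+# look like this (illustrative only; the real call site at the bottom of this
+# file probes MARCH native):
+#   _check_defined_cpu_feature(skylake_features MARCH skylake)
+#   # -> skylake_features could then contain e.g. "AVX;AVX2;SSE;SSE2".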
+function(_check_defined_cpu_feature output_var)
+  cmake_parse_arguments(
+    "CHECK_DEFINED"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  compute_flags(
+    flags
+    MARCH ${CHECK_DEFINED_MARCH}
+    REQUIRE ${CHECK_DEFINED_REQUIRE}
+    REJECT ${CHECK_DEFINED_REJECT})
   try_run(
-    run_result
-    compile_result
-    "${CMAKE_CURRENT_BINARY_DIR}/check_${lowercase_feature}"
-    "${CMAKE_MODULE_PATH}/cpu_features/check_${lowercase_feature}.cpp"
-    COMPILE_DEFINITIONS ${CPU_FEATURE_${feature}_ENABLE_FLAG}
-    OUTPUT_VARIABLE compile_output
-  )
+    run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_${feature}"
+    "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp"
+    COMPILE_DEFINITIONS ${flags}
+    COMPILE_OUTPUT_VARIABLE compile_output
+    RUN_OUTPUT_VARIABLE run_output)
   if(${compile_result} AND ("${run_result}" EQUAL 0))
-    list(APPEND HOST_CPU_FEATURES ${feature})
-    set(HOST_CPU_FEATURES ${HOST_CPU_FEATURES} PARENT_SCOPE)
+    set(${output_var}
+        "${run_output}"
+        PARENT_SCOPE)
+  else()
+    message(FATAL_ERROR "${compile_output}")
   endif()
 endfunction()
 
-foreach(feature IN LISTS ALL_CPU_FEATURES)
-  _check_host_cpu_feature(${feature})
-endforeach()
+# Populates the HOST_CPU_FEATURES list.
+_check_defined_cpu_feature(HOST_CPU_FEATURES MARCH native)
diff --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake
index 18e1d0a081c3ab..2391ea50b0db55 100644
--- a/libc/cmake/modules/LLVMLibCRules.cmake
+++ b/libc/cmake/modules/LLVMLibCRules.cmake
@@ -372,6 +372,7 @@ endfunction(add_redirector_library)
 #      SRCS <list of .cpp files>
 #      HDRS <list of .h files>
 #      DEPENDS <list of dependencies>
+#      COMPILE_OPTIONS <list of special compile options for this target>
 #    )
 function(add_libc_unittest target_name)
   if(NOT LLVM_INCLUDE_TESTS)
@@ -382,7 +383,7 @@ function(add_libc_unittest target_name)
     "LIBC_UNITTEST"
     "" # No optional arguments
     "SUITE" # Single value arguments
-    "SRCS;HDRS;DEPENDS" # Multi-value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments
     ${ARGN}
   )
   if(NOT LIBC_UNITTEST_SRCS)
@@ -420,6 +421,12 @@ function(add_libc_unittest target_name)
     ${LIBC_BUILD_DIR}
     ${LIBC_BUILD_DIR}/include
   )
+  if(LIBC_UNITTEST_COMPILE_OPTIONS)
+    target_compile_options(
+      ${target_name}
+      PRIVATE ${LIBC_UNITTEST_COMPILE_OPTIONS}
+    )
+  endif()
 
   if(library_deps)
     target_link_libraries(${target_name} PRIVATE ${library_deps})
diff --git a/libc/cmake/modules/cpu_features/check_avx.cpp b/libc/cmake/modules/cpu_features/check_avx.cpp
deleted file mode 100644
index f0db3abab4e5fd..00000000000000
--- a/libc/cmake/modules/cpu_features/check_avx.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX__
-#error "missing __AVX__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm256_set1_epi8('0');
-  return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_avx512f.cpp b/libc/cmake/modules/cpu_features/check_avx512f.cpp
deleted file mode 100644
index 93444e737ef4fb..00000000000000
--- a/libc/cmake/modules/cpu_features/check_avx512f.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX512F__
-#error "missing __AVX512F__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm512_undefined();
-  return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
new file mode 100644
index 00000000000000..25f67a63e192a2
--- /dev/null
+++ b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// This file is instantiated by CMake.
+
+// DEFINITIONS below is replaced with a set of lines like so:
+// #ifdef __SSE2__
+//   "SSE2",
+// #endif
+//
+// This allows for introspection of compiler definitions.
+// The output of the program is a single line of semicolon-separated feature
+// names.
+
+// MSVC uses a different set of preprocessor definitions for
+// SSE and SSE2, see _M_IX86_FP in
+// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+
+int main(int, char **) {
+  const char *strings[] = {
+      @DEFINITIONS@
+  };
+  const size_t size = sizeof(strings) / sizeof(strings[0]);
+  for (size_t i = 0; i < size; ++i) {
+    if (i)
+      putchar(';');
+    fputs(strings[i], stdout);
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/libc/cmake/modules/cpu_features/check_sse.cpp b/libc/cmake/modules/cpu_features/check_sse.cpp
deleted file mode 100644
index 1c1f67179fded6..00000000000000
--- a/libc/cmake/modules/cpu_features/check_sse.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE__
-#error "missing __SSE__"
-#endif
-#include <xmmintrin.h>
-int main() {
-  (void)_mm_set_ss(1.0f);
-  return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_sse2.cpp b/libc/cmake/modules/cpu_features/check_sse2.cpp
deleted file mode 100644
index f1e598de587755..00000000000000
--- a/libc/cmake/modules/cpu_features/check_sse2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE2__
-#error "missing __SSE2__"
-#endif
-#include <emmintrin.h>
-int main() {
-  (void)_mm_set1_epi8('0');
-  return 0;
-}
diff --git a/libc/config/linux/CMakeLists.txt b/libc/config/linux/CMakeLists.txt
index 86b178abb0dec8..91135bedcd50b3 100644
--- a/libc/config/linux/CMakeLists.txt
+++ b/libc/config/linux/CMakeLists.txt
@@ -6,6 +6,8 @@ add_gen_header(
     inline_syscalls=${LIBC_TARGET_MACHINE}/syscall.h.inc
   DATA_FILES
     ${LIBC_TARGET_MACHINE}/syscall.h.inc
+  DEPENDS
+    support_common_h
 )
 
 add_subdirectory(x86_64)
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 832d79c1e859e9..b234c91704a9d0 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -11,6 +11,7 @@ add_entrypoint_library(
     # string.h entrypoints
     strcpy
     strcat
+    memcpy
 
     # sys/mman.h entrypoints
     mmap
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 49b0afd2cedf9e..729ccaaa2b20a6 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(memory_utils)
+
 add_entrypoint_object(
   strcat
   SRCS
@@ -19,4 +21,63 @@ add_entrypoint_object(
     string_h
 )
 
-add_subdirectory(memory_utils)
+# ------------------------------------------------------------------------------
+# memcpy
+# ------------------------------------------------------------------------------
+
+# Include the relevant architecture-specific implementations.
+if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
+  set(LIBC_MEMCPY_IMPL_FOLDER "x86")
+else()
+  set(LIBC_MEMCPY_IMPL_FOLDER ${LIBC_TARGET_MACHINE})
+endif()
+
+add_gen_header(
+  memcpy_arch_specific
+  DEF_FILE
+    memcpy_arch_specific.h.def
+  GEN_HDR
+    memcpy_arch_specific.h
+  PARAMS
+    memcpy_arch_specific=${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+  DATA_FILES
+    ${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+)
+
+# Helper to define an implementation of memcpy.
+# - Computes flags to satisfy required/rejected features and arch,
+# - Declares an entry point,
+# - Attaches the REQUIRE_CPU_FEATURES property to the target,
+# - Adds the target to the `memcpy_implementations` global property for tests.
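+# Example (illustrative; the real call sites are at the bottom of this file
+# and in the per-architecture CMakeLists.txt, e.g. x86/CMakeLists.txt):
+#   add_memcpy(memcpy_x86_64_opt_sse REQUIRE "SSE" REJECT "SSE2")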
+function(add_memcpy memcpy_name)
+  cmake_parse_arguments(
+    "ADD_MEMCPY"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  compute_flags(flags
+    MARCH ${ADD_MEMCPY_MARCH}
+    REQUIRE ${ADD_MEMCPY_REQUIRE}
+    REJECT ${ADD_MEMCPY_REJECT}
+  )
+  add_entrypoint_object(
+    ${memcpy_name}
+    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+    HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
+    DEPENDS
+      string_h
+      memory_utils
+      memcpy_arch_specific
+    COMPILE_OPTIONS
+      -fno-builtin-memcpy
+      ${flags}
+  )
+  set_target_properties(${memcpy_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_MEMCPY_REQUIRE}")
+  get_property(all GLOBAL PROPERTY memcpy_implementations)
+  list(APPEND all ${memcpy_name})
+  set_property(GLOBAL PROPERTY memcpy_implementations "${all}")
+endfunction()
+
+add_subdirectory(${LIBC_MEMCPY_IMPL_FOLDER})
+add_memcpy(memcpy MARCH native)
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file mode 100644
index 00000000000000..216e22938e7fd0
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,22 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_no_return(reinterpret_cast<char *>(dst),
+                   reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file mode 100644
index 00000000000000..a3ae4d40c874b1
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file mode 100644
index 00000000000000..a9bb35223ef8b0
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,65 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+%%include_file(${memcpy_arch_specific})
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code; this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<1>(dst, src);
+  if (count == 2)
+    return Copy<2>(dst, src);
+  if (count == 3)
+    return Copy<3>(dst, src);
+  if (count == 4)
+    return Copy<4>(dst, src);
+  if (count < 8)
+    return CopyOverlap<4>(dst, src, count);
+  if (count == 8)
+    return Copy<8>(dst, src);
+  if (count < 16)
+    return CopyOverlap<8>(dst, src, count);
+  if (count == 16)
+    return Copy<16>(dst, src);
+  if (count < 32)
+    return CopyOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyOverlap<64>(dst, src, count);
+  CopyGE128(dst, src, count);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 259ed0a7582812..b826f1f68707f3 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@ add_gen_header(
 
 add_header_library(
   memory_utils
-  HDRS utils.h
-  DEPENDS cacheline_size
+  HDRS
+    utils.h
+    memcpy_utils.h
+  DEPENDS
+    cacheline_size
 )
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
index 00000000000000..c69e557574c05d
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
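+// A test can, for instance, compile this header with
+// -DLLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor and provide the hook itself to
+// record every byte range touched (a sketch of what memcpy_utils_test.cpp
+// below does):
+//   extern "C" void memcpy_monitor(char *__restrict dst,
+//                                  const char *__restrict src, size_t count) {
+//     // record [dst, dst + count) as written and [src, src + count) as read
+//   }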
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    Copy<kBlockSize>(dst + offset, src + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 33df113213b5c6..af9b6aeeee5134 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@ static constexpr size_t ge_power2(size_t value) {
   return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
 }
 
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+  static_assert(is_power2(alignment), "alignment must be a power of 2");
+  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
 template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
   static_assert(is_power2(alignment), "alignment must be a power of 2");
   // The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@ template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
 }
 
 // Returns the offset from `ptr` to the next cache line.
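 // For example, with a 64-byte cache line, a pointer at address 0x1010
 // (16 bytes into its line) gives offset_from_last_aligned<64>(ptr) == 16 and
 // offset_to_next_aligned<64>(ptr) == 48. (Illustrative values.)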
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
   return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
 }
diff --git a/libc/src/string/x86/CMakeLists.txt b/libc/src/string/x86/CMakeLists.txt
new file mode 100644
index 00000000000000..b5365733fb8081
--- /dev/null
+++ b/libc/src/string/x86/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx512f" REQUIRE "AVX512F")
diff --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
new file mode 100644
index 00000000000000..ace98ba2e81114
--- /dev/null
+++ b/libc/src/string/x86/memcpy_arch_specific.h.inc
@@ -0,0 +1,35 @@
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize == 0  : Only RepMovsb is used.
+  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<BEST_SIZE>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index 258937c7f4f610..43536e96d5525b 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,24 @@ add_libc_unittest(
   DEPENDS
     strcpy
 )
+
+# Tests all implementations of memcpy that can run on the host.
+get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations)
+foreach(memcpy_config_name IN LISTS memcpy_implementations)
+  get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES)
+  host_supports(can_run "${require_cpu_features}")
+  if(can_run)
+    add_libc_unittest(
+      ${memcpy_config_name}_test
+      SUITE
+        libc_string_unittests
+      SRCS
+        memcpy_test.cpp
+      DEPENDS
+        ${memcpy_config_name}
+    )
+  else()
+    message(STATUS "Skipping test for '${memcpy_config_name}': insufficient host cpu features")
+  endif()
+endforeach()
+
diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file mode 100644
index 00000000000000..c83cdb60fc96f1
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,53 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index 258937c7f4f610..43536e96d5525b 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -22,3 +22,24 @@ add_libc_unittest( DEPENDS strcpy ) + +# Tests all implementations of memcpy that can run on the host. +get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations) +foreach(memcpy_config_name IN LISTS memcpy_implementations) + get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES) + host_supports(can_run "${require_cpu_features}") + if(can_run) + add_libc_unittest( + ${memcpy_config_name}_test + SUITE + libc_string_unittests + SRCS + memcpy_test.cpp + DEPENDS + ${memcpy_config_name} + ) + else() + message(STATUS "Skipping test for '${memcpy_config_name}', insufficient host cpu features") + endif() +endforeach() + diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp new file mode 100644 index 00000000000000..c83cdb60fc96f1 --- /dev/null +++ b/libc/test/src/string/memcpy_test.cpp @@ -0,0 +1,53 @@ +//===----------------------- Unittests for memcpy -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "utils/CPP/ArrayRef.h" +#include "utils/UnitTest/Test.h" +#include "src/string/memcpy.h" + +using __llvm_libc::cpp::Array; +using __llvm_libc::cpp::ArrayRef; +using __llvm_libc::cpp::MutableArrayRef; +using Data = Array<char, 2048>; + +static const ArrayRef<char> kNumbers("0123456789", 10); +static const ArrayRef<char> kDeadcode("DEADC0DE", 8); + +// Returns a Data object filled with a repetition of `filler`. +Data getData(ArrayRef<char> filler) { + Data out; + for (size_t i = 0; i < out.size(); ++i) + out[i] = filler[i % filler.size()]; + return out; +} + +TEST(MemcpyTest, Thorough) { + const Data groundtruth = getData(kNumbers); + const Data dirty = getData(kDeadcode); + for (size_t count = 0; count < 1024; ++count) { + for (size_t align = 0; align < 64; ++align) { + auto buffer = dirty; + const char *const src = groundtruth.data(); + char *const dst = &buffer[align]; + __llvm_libc::memcpy(dst, src, count); + // Everything before copy is untouched. + for (size_t i = 0; i < align; ++i) + ASSERT_EQ(buffer[i], dirty[i]); + // Everything in between is copied. + for (size_t i = 0; i < count; ++i) + ASSERT_EQ(buffer[align + i], groundtruth[i]); + // Everything after copy is untouched. + for (size_t i = align + count; i < dirty.size(); ++i) + ASSERT_EQ(buffer[i], dirty[i]); + } + } +} + +// FIXME: Add tests with reads and writes on the boundary of a read/write +// protected page to check we're not reading nor writing prior/past the allowed +// regions.
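The FIXME above calls for guard-page coverage. A minimal sketch of that technique on a POSIX host follows; the helper name is invented here and error handling is elided, so treat it as a starting point rather than the intended implementation.

#include <sys/mman.h>
#include <unistd.h>

// Returns a buffer whose last `size` bytes sit flush against a PROT_NONE
// page, so any read or write past `size` faults immediately.
static char *AllocWithGuardPage(size_t size) {
  const size_t page = sysconf(_SC_PAGESIZE);
  const size_t total = ((size + page - 1) / page + 1) * page;
  char *base = static_cast<char *>(mmap(nullptr, total, PROT_READ | PROT_WRITE,
                                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
  mprotect(base + total - page, page, PROT_NONE); // trap page at the end
  return base + total - page - size;              // data ends at the trap page
}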
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt index e3ec8eb40cf3e2..c02fa865fcaa03 100644 --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -4,7 +4,14 @@ add_libc_unittest( libc_string_unittests SRCS utils_test.cpp + memcpy_utils_test.cpp DEPENDS memory_utils standalone_cpp ) + +target_compile_definitions( + utils_test + PRIVATE + LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor +) diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp new file mode 100644 index 00000000000000..813e86fe65dbd1 --- /dev/null +++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp @@ -0,0 +1,208 @@ +//===-------------------- Unittests for memory_utils ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memory_utils/memcpy_utils.h" +#include "utils/CPP/Array.h" +#include "utils/UnitTest/Test.h" + +#include <assert.h> +#include <stdint.h> // uintptr_t + +#ifndef LLVM_LIBC_MEMCPY_MONITOR +#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test. +#endif + +namespace __llvm_libc { + +struct Buffer { + static constexpr size_t kMaxBuffer = 1024; + char buffer[kMaxBuffer + 1]; + size_t last = 0; + + void Clear() { + last = 0; + for (size_t i = 0; i < kMaxBuffer; ++i) + buffer[i] = '0'; + buffer[kMaxBuffer] = '\0'; + } + + void Increment(const void *ptr) { + const auto offset = reinterpret_cast<uintptr_t>(ptr); + assert(offset < kMaxBuffer); + ++buffer[offset]; + if (offset > last) + last = offset; + } + + char *Finish() { + assert(last < kMaxBuffer); + buffer[last + 1] = '\0'; + return buffer; + } +}; + +struct Trace { + Buffer read; + Buffer write; + + void Add(char *__restrict dst, const char *__restrict src, size_t count) { + for (size_t i = 0; i < count; ++i) + read.Increment(src + i); + for (size_t i = 0; i < count; ++i) + write.Increment(dst + i); + } + + void Clear() { + read.Clear(); + write.Clear(); + } + + char *Read() { return read.Finish(); } + char *Write() { return write.Finish(); } +}; + +static Trace &GetTrace() { + static thread_local Trace events; + return events; +} + +extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst, + const char *__restrict src, + size_t count) { + GetTrace().Add(dst, src, count); +} + +char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); } + +TEST(MemcpyUtilsTest, CopyTrivial) { + auto &trace = GetTrace(); + + trace.Clear(); + Copy<1>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1"); + EXPECT_STREQ(trace.Read(), "1"); + + trace.Clear(); + Copy<2>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "11"); + EXPECT_STREQ(trace.Read(), "11"); + + trace.Clear(); + Copy<4>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1111"); + EXPECT_STREQ(trace.Read(), "1111"); + + trace.Clear(); + Copy<8>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "11111111"); + EXPECT_STREQ(trace.Read(), "11111111"); + + trace.Clear(); + Copy<16>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "1111111111111111"); + EXPECT_STREQ(trace.Read(), "1111111111111111"); + + trace.Clear(); + Copy<32>(I(0), I(0)); + EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111"); + EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111"); + + trace.Clear(); + Copy<64>(I(0), I(0)); + EXPECT_STREQ( + trace.Write(), + "1111111111111111111111111111111111111111111111111111111111111111"); + EXPECT_STREQ( + trace.Read(), + "1111111111111111111111111111111111111111111111111111111111111111"); +} + +TEST(MemcpyUtilsTest, CopyOffset) { + auto &trace = GetTrace(); + + trace.Clear(); + Copy<1>(I(3), I(1)); + EXPECT_STREQ(trace.Write(), "0001"); + EXPECT_STREQ(trace.Read(), "01"); + + trace.Clear(); + Copy<1>(I(2), I(1)); + EXPECT_STREQ(trace.Write(), "001"); + EXPECT_STREQ(trace.Read(), "01"); +} + +TEST(MemcpyUtilsTest, CopyOverlap) { + auto &trace = GetTrace(); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 2); + EXPECT_STREQ(trace.Write(), "22"); + EXPECT_STREQ(trace.Read(), "22"); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 3); + EXPECT_STREQ(trace.Write(), "121"); + EXPECT_STREQ(trace.Read(), "121"); + + trace.Clear(); + CopyOverlap<2>(I(0), I(0), 4); + EXPECT_STREQ(trace.Write(), "1111"); + EXPECT_STREQ(trace.Read(), "1111"); + + trace.Clear(); + CopyOverlap<4>(I(2), I(1), 7); + EXPECT_STREQ(trace.Write(), "001112111"); + EXPECT_STREQ(trace.Read(), "01112111"); +} +
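To make the trace encoding concrete: each character is a per-offset access count, so an overlapping copy shows up as a run of '2's between two runs of '1's. A case like the following, using the same harness but not part of the patch, would pin down the midpoint behavior of CopyOverlap<4>:

// Hypothetical extra case: blocks [0, 4) and [2, 6) overlap at offsets 2..3.
TEST(MemcpyUtilsTest, CopyOverlapMidpoint) {
  auto &trace = GetTrace();
  trace.Clear();
  CopyOverlap<4>(I(0), I(0), 6);
  EXPECT_STREQ(trace.Write(), "112211");
  EXPECT_STREQ(trace.Read(), "112211");
}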
+ // "1111000000000" + // + "0000111100000" + // + "0000000011110" + // + "0000000001111" + // = "1111111112221" + trace.Clear(); + CopyAligned<4>(I(0), I(0), 13); + EXPECT_STREQ(trace.Write(), "1111111112221"); + EXPECT_STREQ(trace.Read(), "1111111112221"); + + // Misaligned destination + // "01111000000000" + // + "00001111000000" + // + "00000000111100" + // + "00000000001111" + // = "01112111112211" + trace.Clear(); + CopyAligned<4>(I(1), I(0), 13); + EXPECT_STREQ(trace.Write(), "01112111112211"); + EXPECT_STREQ(trace.Read(), "1112111112211"); +} + +TEST(MemcpyUtilsTest, MaxReloads) { + auto &trace = GetTrace(); + for (size_t alignment = 0; alignment < 32; ++alignment) { + for (size_t count = 64; count < 768; ++count) { + trace.Clear(); + // We should never reload more than twice when copying from count = 2x32. + CopyAligned<32>(I(alignment), I(0), count); + const char *const written = trace.Write(); + // First bytes are untouched. + for (size_t i = 0; i < alignment; ++i) + EXPECT_EQ(written[i], '0'); + // Next bytes are loaded once or twice but no more. + for (size_t i = alignment; i < count; ++i) { + EXPECT_GE(written[i], '1'); + EXPECT_LE(written[i], '2'); + } + } + } +} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp index 5a14cb1df8e497..c1564334c640bf 100644 --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -87,6 +87,14 @@ TEST(UtilsTest, OffsetToNextAligned) { EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16)); } +TEST(UtilsTest, OffsetFromLastAligned) { + EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0)); + EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15)); + EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16)); +} + TEST(UtilsTest, OffsetToNextCacheLine) { EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0); EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0)); diff --git a/libcxx/include/functional b/libcxx/include/functional index 63e3cbed046af9..b13992f94e2b0d 100644 --- a/libcxx/include/functional +++ b/libcxx/include/functional @@ -1618,7 +1618,7 @@ public: // __base provides an abstract interface for copyable functors. 
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index 6b8b855afc6505..8fdf4a4939d1d4 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -544,6 +544,18 @@ template <bool _Bp, class _Tp = void> using enable_if_t = typename enable_if<_Bp // is_same +#if __has_keyword(__is_same) + +template <class _Tp, class _Up> +struct _LIBCPP_TEMPLATE_VIS is_same : _BoolConstant<__is_same(_Tp, _Up)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp, class _Up> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_same_v = __is_same(_Tp, _Up); +#endif + +#else + template <class _Tp, class _Up> struct _LIBCPP_TEMPLATE_VIS is_same : public false_type {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_same<_Tp, _Tp> : public true_type {}; @@ -553,6 +565,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_same_v = is_same<_Tp, _Up>::value; #endif +#endif // __is_same + template <class _Tp, class _Up> using _IsSame = _BoolConstant< #ifdef __clang__ @@ -656,6 +670,18 @@ struct __two {char __lx[2];}; // is_const +#if __has_keyword(__is_const) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_const : _BoolConstant<__is_const(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_const_v = __is_const(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_const : public false_type {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_const<_Tp const> : public true_type {}; @@ -665,8 +691,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_const_v = is_const<_Tp>::value; #endif +#endif // __has_keyword(__is_const) + // is_volatile +#if __has_keyword(__is_volatile) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_volatile : _BoolConstant<__is_volatile(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_volatile_v = __is_volatile(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_volatile : public false_type {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_volatile<_Tp volatile> : public true_type {}; @@ -676,37 +716,87 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_volatile_v = is_volatile<_Tp>::value; #endif +#endif // __has_keyword(__is_volatile) + // remove_const +#if __has_keyword(__remove_const) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS remove_const {typedef __remove_const(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11 +template <class _Tp> using remove_const_t = __remove_const(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_const {typedef _Tp type;}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_const<const _Tp> {typedef _Tp type;}; #if _LIBCPP_STD_VER > 11 template <class _Tp> using remove_const_t = typename remove_const<_Tp>::type; #endif +#endif // __has_keyword(__remove_const) + // remove_volatile +#if __has_keyword(__remove_volatile) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef __remove_volatile(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11 +template <class _Tp> using remove_volatile_t = __remove_volatile(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef _Tp type;}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_volatile<volatile _Tp> {typedef _Tp type;}; #if _LIBCPP_STD_VER > 11 template <class _Tp> using remove_volatile_t = typename remove_volatile<_Tp>::type; #endif +#endif // __has_keyword(__remove_volatile) + // remove_cv +#if __has_keyword(__remove_cv) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS remove_cv {typedef __remove_cv(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11
+template <class _Tp> using remove_cv_t = __remove_cv(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_cv {typedef typename remove_volatile<typename remove_const<_Tp>::type>::type type;}; #if _LIBCPP_STD_VER > 11 template <class _Tp> using remove_cv_t = typename remove_cv<_Tp>::type; #endif +#endif // __has_keyword(__remove_cv) + // is_void -template <class _Tp> struct __libcpp_is_void : public false_type {}; -template <> struct __libcpp_is_void<void> : public true_type {}; +#if __has_keyword(__is_void) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_void(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_void_v = __is_void(_Tp); +#endif + +#else template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_void - : public __libcpp_is_void<typename remove_cv<_Tp>::type> {}; + : public is_same<typename remove_cv<_Tp>::type, void> {}; #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template <class _Tp> @@ -714,6 +804,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_void_v = is_void<_Tp>::value; #endif +#endif // __has_keyword(__is_void) + // __is_nullptr_t template <class _Tp> struct __is_nullptr_t_impl : public false_type {}; @@ -735,6 +827,18 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_null_pointer_v // is_integral +#if __has_keyword(__is_integral) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_integral : _BoolConstant<__is_integral(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_integral_v = __is_integral(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_integral : public _BoolConstant<__libcpp_is_integral<typename remove_cv<_Tp>::type>::value> {}; @@ -744,8 +848,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_integral_v = is_integral<_Tp>::value; #endif +#endif // __has_keyword(__is_integral) + // is_floating_point +#if __has_keyword(__is_floating_point) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_floating_point : _BoolConstant<__is_floating_point(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = __is_floating_point(_Tp); +#endif + +#else + template <class _Tp> struct __libcpp_is_floating_point : public false_type {}; template <> struct __libcpp_is_floating_point<float> : public true_type {}; template <> struct __libcpp_is_floating_point<double> : public true_type {}; @@ -760,8 +878,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = is_floating_point<_Tp>::value; #endif +#endif // __has_keyword(__is_floating_point) + // is_array +#if __has_keyword(__is_array) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_array : _BoolConstant<__is_array(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_array_v = __is_array(_Tp); +#endif + +#else + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_array : public false_type {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_array<_Tp[]> @@ -775,6 +907,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_array_v = is_array<_Tp>::value; #endif +#endif // __has_keyword(__is_array) + // is_pointer template <class _Tp> struct __libcpp_is_pointer : public false_type {}; @@ -788,6 +922,18 @@ template <class _Tp> struct __libcpp_remove_objc_qualifiers<_Tp __autoreleasing> template <class _Tp> struct __libcpp_remove_objc_qualifiers<_Tp __unsafe_unretained> { typedef _Tp type; }; #endif +#if __has_keyword(__is_pointer) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_pointer : _BoolConstant<__is_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_pointer_v = __is_pointer(_Tp); +#endif + +#else // __has_keyword(__is_pointer) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_pointer : public __libcpp_is_pointer<typename __libcpp_remove_objc_qualifiers<typename remove_cv<_Tp>::type>::type> {}; @@ -797,8 +943,36 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_pointer_v = is_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_pointer) +
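A note on the guard used throughout these hunks: `__has_keyword` is not a compiler primitive. libc++ derives it in <__config> from Clang's `__is_identifier`, which reports whether a token is still an ordinary identifier, so a trait builtin is "a keyword" exactly when the compiler has claimed its name. Paraphrased from <__config>:

// A token names a builtin/keyword exactly when the compiler no longer
// treats it as a plain identifier.
#ifndef __is_identifier         // compilers without the extension treat
#define __is_identifier(__x) 1  // every token as an identifier
#endif
#define __has_keyword(__x) !(__is_identifier(__x))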
// is_reference +#if __has_keyword(__is_lvalue_reference) && \ + __has_keyword(__is_rvalue_reference) && \ + __has_keyword(__is_reference) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference : _BoolConstant<__is_lvalue_reference(_Tp)> { }; + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_rvalue_reference : _BoolConstant<__is_rvalue_reference(_Tp)> { }; + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_reference : _BoolConstant<__is_reference(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_reference_v = __is_reference(_Tp); + +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_lvalue_reference_v = __is_lvalue_reference(_Tp); + +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_rvalue_reference_v = __is_rvalue_reference(_Tp); +#endif + +#else // __has_keyword(__is_lvalue_reference) && etc... + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference : public false_type {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference<_Tp&> : public true_type {}; @@ -822,6 +996,9 @@ template <class _Tp> _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_rvalue_reference_v = is_rvalue_reference<_Tp>::value; #endif + +#endif // __has_keyword(__is_lvalue_reference) && etc... + // is_union #if __has_feature(is_union) || defined(_LIBCPP_COMPILER_GCC) @@ -902,6 +1079,19 @@ template <class _Tp, class _Up> struct __libcpp_is_member_pointer<_Tp _Up::*> { }; }; +#if __has_keyword(__is_member_function_pointer) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer + : _BoolConstant<__is_member_function_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_function_pointer_v + = __is_member_function_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_function_pointer) template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : public _BoolConstant< __libcpp_is_member_pointer<typename remove_cv<_Tp>::type>::__is_func > {}; @@ -912,8 +1102,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_function_pointer_v = is_member_function_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_function_pointer) + // is_member_pointer +#if __has_keyword(__is_member_pointer) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_member_pointer : _BoolConstant<__is_member_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_pointer_v = __is_member_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_pointer) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_member_pointer : public _BoolConstant< __libcpp_is_member_pointer<typename remove_cv<_Tp>::type>::__is_member > {}; @@ -923,8 +1127,24 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_pointer_v = is_member_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_pointer) + // is_member_object_pointer +#if __has_keyword(__is_member_object_pointer) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer + : _BoolConstant<__is_member_object_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool
is_member_object_pointer_v + = __is_member_object_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_object_pointer) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : public _BoolConstant< __libcpp_is_member_pointer<typename remove_cv<_Tp>::type>::__is_obj > {}; @@ -934,6 +1154,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_object_pointer_v = is_member_object_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_object_pointer) + // is_enum #if __has_feature(is_enum) || defined(_LIBCPP_COMPILER_GCC) @@ -941,6 +1163,11 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_object_pointer_v template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_enum : public integral_constant<bool, __is_enum(_Tp)> {}; +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_enum_v = __is_enum(_Tp); +#endif + #else template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_enum @@ -955,16 +1182,28 @@ template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_enum !is_class<_Tp>::value && !is_function<_Tp>::value > {}; -#endif - #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template <class _Tp> _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_enum_v = is_enum<_Tp>::value; #endif +#endif // __has_feature(is_enum) || defined(_LIBCPP_COMPILER_GCC) + // is_arithmetic +#if __has_keyword(__is_arithmetic) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_arithmetic : _BoolConstant<__is_arithmetic(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = __is_arithmetic(_Tp); +#endif + +#else // __has_keyword(__is_arithmetic) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_arithmetic : public integral_constant<bool, is_integral<_Tp>::value || is_floating_point<_Tp>::value> {}; @@ -975,8 +1214,24 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = is_arithmetic<_Tp>::value; #endif +#endif // __has_keyword(__is_arithmetic) + // is_fundamental +// In clang 9 and lower, this builtin did not work for nullptr_t. Additionally, in C++03 mode, +// nullptr isn't defined by the compiler, so this builtin won't work.
+#if __has_keyword(__is_fundamental) && _LIBCPP_CLANG_VER > 900 && !defined(_LIBCPP_CXX03_LANG) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_fundamental : _BoolConstant<__is_fundamental(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_fundamental_v = __is_fundamental(_Tp); +#endif + +#else // __has_keyword(__is_fundamental) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_fundamental : public integral_constant<bool, is_void<_Tp>::value || __is_nullptr_t<_Tp>::value || @@ -988,8 +1243,23 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_fundamental_v = is_fundamental<_Tp>::value; #endif +#endif // __has_keyword(__is_fundamental) + // is_scalar +// >= 11 because in C++03 nullptr isn't actually nullptr +#if __has_keyword(__is_scalar) && !defined(_LIBCPP_CXX03_LANG) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_scalar : _BoolConstant<__is_scalar(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_scalar_v = __is_scalar(_Tp); +#endif + +#else // __has_keyword(__is_scalar) + template <class _Tp> struct __is_block : false_type {}; #if defined(_LIBCPP_HAS_EXTENSION_BLOCKS) template <class _Rp, class ..._Args> struct __is_block<_Rp (^)(_Args...)> : true_type {}; @@ -1011,8 +1281,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_scalar_v = is_scalar<_Tp>::value; #endif +#endif // __has_keyword(__is_scalar) + // is_object +#if __has_keyword(__is_object) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_object : _BoolConstant<__is_object(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_object_v = __is_object(_Tp); +#endif + +#else // __has_keyword(__is_object) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_object : public integral_constant<bool, is_scalar<_Tp>::value || is_array<_Tp>::value || @@ -1025,8 +1309,23 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_object_v = is_object<_Tp>::value; #endif +#endif // __has_keyword(__is_object) + // is_compound +// >= 11 because in C++03 nullptr isn't actually nullptr +#if __has_keyword(__is_compound) && !defined(_LIBCPP_CXX03_LANG) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_compound : _BoolConstant<__is_compound(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_compound_v = __is_compound(_Tp); +#endif + +#else // __has_keyword(__is_compound) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_compound : public integral_constant<bool, !is_fundamental<_Tp>::value> {}; @@ -1036,6 +1335,7 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_compound_v = is_compound<_Tp>::value; #endif +#endif // __has_keyword(__is_compound) // __is_referenceable [defns.referenceable] @@ -1080,6 +1380,13 @@ template <class _Tp> using add_cv_t = typename add_cv<_Tp>::type; // remove_reference +#if __has_keyword(__remove_reference) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS remove_reference { typedef __remove_reference(_Tp) type; } ; + +#else // __has_keyword(__remove_reference) + template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&> {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; @@ -1088,6 +1395,8 @@ template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typede template <class _Tp> using remove_reference_t = typename remove_reference<_Tp>::type; #endif +#endif // __has_keyword(__remove_reference) +
// add_lvalue_reference template <class _Tp, bool = __is_referenceable<_Tp>::value> struct __add_lvalue_reference_impl { typedef _LIBCPP_NODEBUG_TYPE _Tp type; }; @@ -1195,6 +1504,19 @@ template <class _Tp> using type_identity_t = typename type_identity<_Tp>::type; // is_signed +// In clang 9 and earlier, this builtin did not work for floating points or enums +#if __has_keyword(__is_signed) && _LIBCPP_CLANG_VER > 900 + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_signed : _BoolConstant<__is_signed(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_signed_v = __is_signed(_Tp); +#endif + +#else // __has_keyword(__is_signed) + template <class _Tp, bool = is_integral<_Tp>::value> struct __libcpp_is_signed_impl : public _LIBCPP_BOOL_CONSTANT(_Tp(-1) < _Tp(0)) {}; @@ -1214,8 +1536,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_signed_v = is_signed<_Tp>::value; #endif +#endif // __has_keyword(__is_signed) + // is_unsigned +#if __has_keyword(__is_unsigned) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v = __is_unsigned(_Tp); +#endif + +#else // __has_keyword(__is_unsigned) + template <class _Tp, bool = is_integral<_Tp>::value> struct __libcpp_is_unsigned_impl : public _LIBCPP_BOOL_CONSTANT(_Tp(0) < _Tp(-1)) {}; @@ -1235,6 +1571,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v = is_unsigned<_Tp>::value; #endif +#endif // __has_keyword(__is_unsigned) + // rank template <class _Tp> struct _LIBCPP_TEMPLATE_VIS rank @@ -1252,6 +1590,19 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t rank_v // extent +#if __has_keyword(__array_extent) + +template <class _Tp, unsigned _Ip = 0> +struct _LIBCPP_TEMPLATE_VIS extent + : integral_constant<size_t, __array_extent(_Tp, _Ip)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp, unsigned _Ip = 0> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t extent_v = __array_extent(_Tp, _Ip); +#endif + +#else // __has_keyword(__array_extent) + template <class _Tp, unsigned _Ip = 0> struct _LIBCPP_TEMPLATE_VIS extent : public integral_constant<size_t, 0> {}; template <class _Tp> struct _LIBCPP_TEMPLATE_VIS extent<_Tp[], 0> @@ -1269,6 +1620,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t extent_v = extent<_Tp, _Ip>::value; #endif +#endif // __has_keyword(__array_extent) + // remove_extent template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_extent @@ -2114,6 +2467,18 @@ template <class ..._Tp> using common_type_t = typename common_type<_Tp...>::type template <class _Tp, class _Up> struct __select_2nd { typedef _LIBCPP_NODEBUG_TYPE _Tp type; }; +#if __has_keyword(__is_assignable) + +template <class _Tp, class _Up> +struct _LIBCPP_TEMPLATE_VIS is_assignable : _BoolConstant<__is_assignable(_Tp, _Up)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp, class _Arg> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_assignable_v = __is_assignable(_Tp, _Arg); +#endif + +#else // __has_keyword(__is_assignable) + template <class _Tp, class _Arg> typename __select_2nd<decltype((_VSTD::declval<_Tp>() = _VSTD::declval<_Arg>())), true_type>::type __is_assignable_test(int); @@ -2142,6 +2507,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_assignable_v = is_assignable<_Tp, _Arg>::value; #endif +#endif // __has_keyword(__is_assignable) +
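The fallback path of is_assignable above is the classic expression-SFINAE probe: if the assignment expression is well-formed, the overload taking `int` wins and yields true_type; otherwise resolution drops to the variadic overload. A self-contained rendering of the same idea (not libc++'s exact spelling):

#include <type_traits>
#include <utility>

// If `declval<T>() = declval<Arg>()` compiles, the first overload is viable
// and returns true_type; otherwise the `...` overload is picked.
template <class T, class Arg>
auto assignable_probe(int)
    -> decltype(std::declval<T>() = std::declval<Arg>(), std::true_type{});
template <class, class>
auto assignable_probe(...) -> std::false_type;

static_assert(decltype(assignable_probe<int &, double>(0))::value, "");
static_assert(!decltype(assignable_probe<int &, void *>(0))::value, "");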
// is_copy_assignable template <class _Tp> struct _LIBCPP_TEMPLATE_VIS is_copy_assignable @@ -2168,6 +2535,18 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_move_assignable_v // is_destructible +#if __has_keyword(__is_destructible) + +template <class _Tp> +struct _LIBCPP_TEMPLATE_VIS is_destructible : _BoolConstant<__is_destructible(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template <class _Tp> +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_destructible_v = __is_destructible(_Tp); +#endif + +#else // __has_keyword(__is_destructible) + // if it's a reference, return true // if it's a function, return false // if it's void, return false @@ -2230,6 +2609,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_destructible_v = is_destructible<_Tp>::value; #endif +#endif // __has_keyword(__is_destructible) + // move template <class _Tp> @@ -3859,7 +4240,6 @@ struct underlying_type : __underlying_type_impl<_Tp, is_enum<_Tp>::value> {}; template <class _Tp> using underlying_type_t = typename underlying_type<_Tp>::type; #endif - template <class _Tp, bool = is_enum<_Tp>::value> struct __sfinae_underlying_type { diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp index 739713bf4c2f24..dd83f224013179 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp @@ -50,6 +50,14 @@ void test_is_not_same() static_assert((!std::is_same<T, U>::value), ""); } +template <class T> +struct OverloadTest +{ + void fn(std::is_same<T, int>) { } + void fn(std::false_type) { } + void x() { fn(std::false_type()); } +}; + class Class { public: @@ -70,5 +78,8 @@ int main(int, char**) test_is_not_same<int, void>(); test_is_not_same<void, Class>(); + OverloadTest<char> t; + (void)t; + return 0; } diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 0a041202f2788a..92768bae0c839a 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -848,9 +848,9 @@ OutputSection *ScriptParser::readOutputSectionDescription(StringRef outSec) { // We handle the FILL command as an alias for =fillexp section attribute, // which is different from what GNU linkers do. // https://sourceware.org/binutils/docs/ld/Output-Section-Data.html - expect("("); + if (peek() != "(") + setError("( expected, but got " + peek()); cmd->filler = readFill(); - expect(")"); } else if (tok == "SORT") { readSort(); } else if (tok == "INCLUDE") { @@ -905,8 +911,11 @@ OutputSection *ScriptParser::readOutputSectionDescription(StringRef outSec) { // When reading a hexstring, ld.bfd handles it as a blob of arbitrary // size, while ld.gold always handles it as a 32-bit big-endian number. // We are compatible with ld.gold because it's easier to implement. +// Also, we require that expressions with operators be wrapped in round +// brackets, to resolve the ambiguity when parsing scripts like: +// SECTIONS { .foo : { ... } =120+3 /DISCARD/ : { ... } }
std::array<uint8_t, 4> ScriptParser::readFill() { - uint64_t value = readExpr()().val; + uint64_t value = readPrimary()().val; if (value > UINT32_MAX) setError("filler expression result does not fit 32-bit: 0x" + Twine::utohexstr(value)); diff --git a/lld/test/ELF/linkerscript/sections-padding.s b/lld/test/ELF/linkerscript/sections-padding.s index 5ec0ddbe767a7f..4d147d79c63e6d 100644 --- a/lld/test/ELF/linkerscript/sections-padding.s +++ b/lld/test/ELF/linkerscript/sections-padding.s @@ -7,7 +7,7 @@ # RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES %s # YES: 66000011 22000011 22000011 22000011 -# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =0x1100+0x22 }" > %t.script +# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =(0x1100+0x22) }" > %t.script # RUN: ld.lld -o %t.out --script %t.script %t # RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES2 %s # YES2: 66000011 22000011 22000011 22000011 @@ -66,6 +66,11 @@ # RUN: not ld.lld -o /dev/null %t --script %t.script 2>&1 | FileCheck --check-prefix=ERR4 %s # ERR4: symbol not found: foo +## Check we are able to parse scripts where "/DISCARD/" follows a section fill expression. +# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =0x1122 /DISCARD/ : { *(.text) } }" > %t.script +# RUN: ld.lld -o %t.out --script %t.script %t +# RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES %s + .section .mysec.1,"a" .align 16 .byte 0x66 diff --git a/lldb/include/lldb/DataFormatters/StringPrinter.h b/lldb/include/lldb/DataFormatters/StringPrinter.h index 6f8869cc2a1e37..5842cde893d89c 100644 --- a/lldb/include/lldb/DataFormatters/StringPrinter.h +++ b/lldb/include/lldb/DataFormatters/StringPrinter.h @@ -115,9 +115,15 @@ class StringPrinter { lldb::ProcessSP GetProcessSP() const { return m_process_sp; } + void SetHasSourceSize(bool e) { m_has_source_size = e; } + + bool HasSourceSize() const { return m_has_source_size; } + private: uint64_t m_location = 0; lldb::ProcessSP m_process_sp; + /// True iff we know the source size of the string. + bool m_has_source_size = false; }; class ReadBufferAndDumpToStreamOptions : public DumpToStreamOptions { diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 32ae8ee9b000af..cc28ae9016346c 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -702,7 +702,7 @@ def is_compiler_clang_with_call_site_info(self): f = tempfile.NamedTemporaryFile() cmd = "echo 'int main() {}' | " \ - "%s -g -glldb -O1 -Xclang -femit-debug-entry-values -S -emit-llvm -x c -o %s -" % (compiler_path, f.name) + "%s -g -glldb -O1 -S -emit-llvm -x c -o %s -" % (compiler_path, f.name) if os.popen(cmd).close() is not None: return "Compiler can't compile with call site info enabled" diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index f8f916036f9aab..966d460ea13d98 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1,7 +1,7 @@ """ LLDB module which provides the abstract base class of lldb test case. -The concrete subclass can override lldbtest.TesBase in order to inherit the +The concrete subclass can override lldbtest.TestBase in order to inherit the common behavior for unitest.TestCase.setUp/tearDown implemented in this file.
The subclass should override the attribute mydir in order for the python runtime diff --git a/lldb/source/DataFormatters/StringPrinter.cpp b/lldb/source/DataFormatters/StringPrinter.cpp index 92dd71d17b8c1a..4515b67b2adfd4 100644 --- a/lldb/source/DataFormatters/StringPrinter.cpp +++ b/lldb/source/DataFormatters/StringPrinter.cpp @@ -525,27 +525,33 @@ static bool ReadUTFBufferAndDumpToStream( if (!options.GetStream()) return false; - uint32_t sourceSize = options.GetSourceSize(); + uint32_t sourceSize; bool needs_zero_terminator = options.GetNeedsZeroTermination(); bool is_truncated = false; const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary(); - if (!sourceSize) { + if (options.HasSourceSize()) { + sourceSize = options.GetSourceSize(); + if (!options.GetIgnoreMaxLength()) { + if (sourceSize > max_size) { + sourceSize = max_size; + is_truncated = true; + } + } + } else { sourceSize = max_size; needs_zero_terminator = true; - } else if (!options.GetIgnoreMaxLength()) { - if (sourceSize > max_size) { - sourceSize = max_size; - is_truncated = true; - } } const int bufferSPSize = sourceSize * type_width; lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0)); - if (!buffer_sp->GetBytes()) + // Check if we got bytes. We never get any bytes if we have an empty + // string, but we still continue so that we end up actually printing + // an empty string (""). + if (sourceSize != 0 && !buffer_sp->GetBytes()) return false; Status error; diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp index 0243cc0c375058..5bd2321e48ddcb 100644 --- a/lldb/source/Expression/UserExpression.cpp +++ b/lldb/source/Expression/UserExpression.cpp @@ -259,6 +259,10 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, // If there is a fixed expression, try to parse it: if (!parse_success) { + // Delete the expression that failed to parse before attempting to parse + // the next expression. + user_expression_sp.reset(); + execution_results = lldb::eExpressionParseError; if (fixed_expression && !fixed_expression->empty() && options.GetAutoApplyFixIts()) { diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index 2475338a37fd52..eba3060f8ec63a 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -1088,43 +1088,6 @@ static Status LaunchProcessPosixSpawn(const char *exe_path, return error; } -// posix_spawnattr_setbinpref_np appears to be an Apple extension per: -// http://www.unix.com/man-page/OSX/3/posix_spawnattr_setbinpref_np/ -#if !defined(__arm__) - - // Don't set the binpref if a shell was provided. After all, that's only - // going to affect what version of the shell - // is launched, not what fork of the binary is launched. We insert "arch - // --arch as part of the shell invocation - // to do that job on OSX. - - if (launch_info.GetShell() == FileSpec()) { - // We don't need to do this for ARM, and we really shouldn't now that we - // have multiple CPU subtypes and no posix_spawnattr call that allows us - // to set which CPU subtype to launch... 
- const ArchSpec &arch_spec = launch_info.GetArchitecture(); - cpu_type_t cpu = arch_spec.GetMachOCPUType(); - cpu_type_t sub = arch_spec.GetMachOCPUSubType(); - if (cpu != 0 && cpu != static_cast<cpu_type_t>(UINT32_MAX) && - cpu != static_cast<cpu_type_t>(LLDB_INVALID_CPUTYPE) && - !(cpu == 0x01000007 && sub == 8)) // If haswell is specified, don't try - // to set the CPU type or we will fail - { - size_t ocount = 0; - error.SetError(::posix_spawnattr_setbinpref_np(&attr, 1, &cpu, &ocount), - eErrorTypePOSIX); - if (error.Fail()) - LLDB_LOG(log, - "error: {0}, ::posix_spawnattr_setbinpref_np ( &attr, 1, " - "cpu_type = {1:x}, count => {2} )", - error, cpu, ocount); - - if (error.Fail() || ocount != 1) - return error; - } - } -#endif // !defined(__arm__) - const char *tmp_argv[2]; char *const *argv = const_cast<char *const *>( launch_info.GetArguments().GetConstArgumentVector()); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 78f58754cc319c..b4af67ecee0dd1 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -259,6 +259,7 @@ bool lldb_private::formatters::LibStdcppStringSummaryProvider( if (error.Fail()) return false; options.SetSourceSize(size_of_data); + options.SetHasSourceSize(true); if (!StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF8>(options)) { @@ -319,6 +320,7 @@ bool lldb_private::formatters::LibStdcppWStringSummaryProvider( if (error.Fail()) return false; options.SetSourceSize(size_of_data); + options.SetHasSourceSize(true); options.SetPrefixToken("L"); switch (wchar_size) { diff --git a/lldb/source/Plugins/Language/ObjC/NSString.cpp b/lldb/source/Plugins/Language/ObjC/NSString.cpp index 65256dc7acbdde..7c4afb36b5883c 100644 --- a/lldb/source/Plugins/Language/ObjC/NSString.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSString.cpp @@ -170,6 +170,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(false); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -182,6 +183,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(false); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -199,6 +201,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); options.SetLanguage(summary_options.GetLanguage()); @@ -221,6 +224,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -241,6 +245,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"');
options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -263,6 +268,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -286,6 +292,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); options.SetLanguage(summary_options.GetLanguage()); diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9b3dbb166b687d..9fea9a217dce59 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -1191,33 +1191,33 @@ AppleObjCRuntimeV2::GetClassDescriptor(ValueObject &valobj) { // if we get an invalid VO (which might still happen when playing around with // pointers returned by the expression parser, don't consider this a valid // ObjC object) - if (valobj.GetCompilerType().IsValid()) { - addr_t isa_pointer = valobj.GetPointerValue(); + if (!valobj.GetCompilerType().IsValid()) + return objc_class_sp; + addr_t isa_pointer = valobj.GetPointerValue(); - // tagged pointer - if (IsTaggedPointer(isa_pointer)) { - return m_tagged_pointer_vendor_up->GetClassDescriptor(isa_pointer); - } else { - ExecutionContext exe_ctx(valobj.GetExecutionContextRef()); + // tagged pointer + if (IsTaggedPointer(isa_pointer)) + return m_tagged_pointer_vendor_up->GetClassDescriptor(isa_pointer); + ExecutionContext exe_ctx(valobj.GetExecutionContextRef()); - Process *process = exe_ctx.GetProcessPtr(); - if (process) { - Status error; - ObjCISA isa = process->ReadPointerFromMemory(isa_pointer, error); - if (isa != LLDB_INVALID_ADDRESS) { - objc_class_sp = GetClassDescriptorFromISA(isa); - if (isa && !objc_class_sp) { - Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PROCESS | - LIBLLDB_LOG_TYPES)); - LLDB_LOGF(log, - "0x%" PRIx64 - ": AppleObjCRuntimeV2::GetClassDescriptor() ISA was " - "not in class descriptor cache 0x%" PRIx64, - isa_pointer, isa); - } - } - } - } + Process *process = exe_ctx.GetProcessPtr(); + if (!process) + return objc_class_sp; + + Status error; + ObjCISA isa = process->ReadPointerFromMemory(isa_pointer, error); + if (isa == LLDB_INVALID_ADDRESS) + return objc_class_sp; + + objc_class_sp = GetClassDescriptorFromISA(isa); + if (isa && !objc_class_sp) { + Log *log(GetLogIfAllCategoriesSet(LIBLLDB_LOG_PROCESS | + LIBLLDB_LOG_TYPES)); + LLDB_LOGF(log, + "0x%" PRIx64 + ": AppleObjCRuntimeV2::GetClassDescriptor() ISA was " + "not in class descriptor cache 0x%" PRIx64, + isa_pointer, isa); } return objc_class_sp; } diff --git a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp index 4b9da8f76fd247..333113a0b17a50 100644 --- 
a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp +++ b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp @@ -138,7 +138,12 @@ static void CreateHistoryThreadFromValueObject(ProcessSP process_sp, pcs.push_back(pc); } - HistoryThread *history_thread = new HistoryThread(*process_sp, tid, pcs); + // The ASAN runtime already massages the return addresses into call + // addresses, so we don't want LLDB's unwinder to try to locate the previous + // instruction again, as this might lead to us reporting a different line. + bool pcs_are_call_addresses = true; + HistoryThread *history_thread = + new HistoryThread(*process_sp, tid, pcs, pcs_are_call_addresses); ThreadSP new_thread_sp(history_thread); std::ostringstream thread_name_with_number; thread_name_with_number << thread_name << " Thread " << tid; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index aa1f8994ecb66f..46dd3774e5a987 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -1829,6 +1829,21 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( return Status(); } +std::string PlatformDarwin::FindComponentInPath(llvm::StringRef path, + llvm::StringRef component) { + auto begin = llvm::sys::path::begin(path); + auto end = llvm::sys::path::end(path); + for (auto it = begin; it != end; ++it) { + if (it->contains(component)) { + llvm::SmallString<128> buffer; + llvm::sys::path::append(buffer, begin, ++it, + llvm::sys::path::Style::posix); + return buffer.str().str(); + } + } + return {}; +} + std::string PlatformDarwin::FindXcodeContentsDirectoryInPath(llvm::StringRef path) { auto begin = llvm::sys::path::begin(path); @@ -1959,3 +1974,15 @@ FileSpec PlatformDarwin::GetXcodeContentsDirectory() { }); return g_xcode_contents_path; } + +FileSpec PlatformDarwin::GetCurrentToolchainDirectory() { + if (FileSpec fspec = HostInfo::GetShlibDir()) + return FileSpec(FindComponentInPath(fspec.GetPath(), ".xctoolchain")); + return {}; +} + +FileSpec PlatformDarwin::GetCurrentCommandLineToolsDirectory() { + if (FileSpec fspec = HostInfo::GetShlibDir()) + return FileSpec(FindComponentInPath(fspec.GetPath(), "CommandLineTools")); + return {}; +}
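A worked example may help here, since FindComponentInPath does a substring match on each component (`it->contains(component)`) and rebuilds the prefix up to and including the first hit. Paths below are illustrative, and access control is ignored for the sake of the sketch:

// Hypothetical inputs/outputs for the helper added above:
std::string toolchain = PlatformDarwin::FindComponentInPath(
    "/Apps/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib",
    ".xctoolchain");
// toolchain == "/Apps/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain"
// A path with no matching component yields "", so callers get an empty FileSpec:
// FindComponentInPath("/usr/lib", ".xctoolchain") == ""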
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h index d385712db8e633..6d51edbc92945a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h @@ -100,6 +100,13 @@ class PlatformDarwin : public PlatformPOSIX { static lldb_private::FileSpec GetXcodeSDK(SDKType type); static lldb_private::FileSpec GetXcodeContentsDirectory(); + /// Return the toolchain directory the current LLDB instance is located in. + static lldb_private::FileSpec GetCurrentToolchainDirectory(); + + /// Return the command line tools directory the current LLDB instance is + /// located in. + static lldb_private::FileSpec GetCurrentCommandLineToolsDirectory(); + protected: struct CrashInfoAnnotations { uint64_t version; // unsigned long @@ -172,6 +179,8 @@ class PlatformDarwin : public PlatformPOSIX { const lldb_private::FileSpecList *module_search_paths_ptr, lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr); + static std::string FindComponentInPath(llvm::StringRef path, + llvm::StringRef component); static std::string FindXcodeContentsDirectoryInPath(llvm::StringRef path); std::string m_developer_directory; diff --git a/lldb/source/Plugins/Process/Utility/HistoryThread.cpp b/lldb/source/Plugins/Process/Utility/HistoryThread.cpp index 815883d9e2f642..0649cd2f07de1b 100644 --- a/lldb/source/Plugins/Process/Utility/HistoryThread.cpp +++ b/lldb/source/Plugins/Process/Utility/HistoryThread.cpp @@ -25,12 +25,13 @@ using namespace lldb_private; // Constructor HistoryThread::HistoryThread(lldb_private::Process &process, lldb::tid_t tid, - std::vector<lldb::addr_t> pcs) + std::vector<lldb::addr_t> pcs, + bool pcs_are_call_addresses) : Thread(process, tid, true), m_framelist_mutex(), m_framelist(), m_pcs(pcs), m_extended_unwind_token(LLDB_INVALID_ADDRESS), m_queue_name(), m_thread_name(), m_originating_unique_thread_id(tid), m_queue_id(LLDB_INVALID_QUEUE_ID) { - m_unwinder_up.reset(new HistoryUnwind(*this, pcs)); + m_unwinder_up.reset(new HistoryUnwind(*this, pcs, pcs_are_call_addresses)); Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_OBJECT)); LLDB_LOGF(log, "%p HistoryThread::HistoryThread", static_cast<void *>(this)); } diff --git a/lldb/source/Plugins/Process/Utility/HistoryThread.h b/lldb/source/Plugins/Process/Utility/HistoryThread.h index 434cf6af719758..a66e0f2d4207c9 100644 --- a/lldb/source/Plugins/Process/Utility/HistoryThread.h +++ b/lldb/source/Plugins/Process/Utility/HistoryThread.h @@ -33,7 +33,8 @@ namespace lldb_private { class HistoryThread : public lldb_private::Thread { public: HistoryThread(lldb_private::Process &process, lldb::tid_t tid, - std::vector<lldb::addr_t> pcs); + std::vector<lldb::addr_t> pcs, + bool pcs_are_call_addresses = false); ~HistoryThread() override; diff --git a/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp b/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp index 93fcde72bf993a..9b9522955de940 100644 --- a/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp +++ b/lldb/source/Plugins/Process/Utility/HistoryUnwind.cpp @@ -23,8 +23,10 @@ using namespace lldb_private; // Constructor -HistoryUnwind::HistoryUnwind(Thread &thread, std::vector<lldb::addr_t> pcs) - : Unwind(thread), m_pcs(pcs) {} +HistoryUnwind::HistoryUnwind(Thread &thread, std::vector<lldb::addr_t> pcs, + bool pcs_are_call_addresses) + : Unwind(thread), m_pcs(pcs), + m_pcs_are_call_addresses(pcs_are_call_addresses) {} // Destructor @@ -59,7 +61,10 @@ bool HistoryUnwind::DoGetFrameInfoAtIndex(uint32_t frame_idx, lldb::addr_t &cfa, if (frame_idx < m_pcs.size()) { cfa = frame_idx; pc = m_pcs[frame_idx]; - behaves_like_zeroth_frame = (frame_idx == 0); + if (m_pcs_are_call_addresses) + behaves_like_zeroth_frame = true; + else + behaves_like_zeroth_frame = (frame_idx == 0); return true; } return false;
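The flag matters because of how symbolication treats a frame's pc. In a regular unwind, frame 0 holds the currently executing pc, while outer frames hold return addresses, which point one past the call and can symbolicate to the next source line; consumers therefore back up by one before lookup. ASan's history threads already store call addresses, so every frame can be treated like frame 0. A generic illustration of the convention the flag toggles (not LLDB's actual API):

// Illustrative only: the standard return-address adjustment that
// behaves_like_zeroth_frame suppresses.
lldb::addr_t PcForSymbolication(lldb::addr_t pc, bool behaves_like_zeroth_frame) {
  // Backing up by one byte lands inside the call instruction, so the
  // call site's line is reported instead of the line after it.
  return behaves_like_zeroth_frame ? pc : pc - 1;
}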
diff --git a/lldb/source/Plugins/Process/Utility/HistoryUnwind.h b/lldb/source/Plugins/Process/Utility/HistoryUnwind.h index b15abd92447590..cb72b5d0a17648 100644 --- a/lldb/source/Plugins/Process/Utility/HistoryUnwind.h +++ b/lldb/source/Plugins/Process/Utility/HistoryUnwind.h @@ -18,7 +18,8 @@ namespace lldb_private { class HistoryUnwind : public lldb_private::Unwind { public: - HistoryUnwind(Thread &thread, std::vector<lldb::addr_t> pcs); + HistoryUnwind(Thread &thread, std::vector<lldb::addr_t> pcs, + bool pcs_are_call_addresses = false); ~HistoryUnwind() override; @@ -35,6 +36,9 @@ class HistoryUnwind : public lldb_private::Unwind { private: std::vector<lldb::addr_t> m_pcs; + /// This boolean indicates that the PCs in the non-0 frames are call + /// addresses and not return addresses. + bool m_pcs_are_call_addresses; }; } // namespace lldb_private diff --git a/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py b/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py index 76e79df5cd1c8f..2b1cb100a32513 100644 --- a/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py @@ -15,6 +15,7 @@ class ImportStdModule(TestBase): # but we still add the libc++ category so that this test is only run in # test configurations where libc++ is actually supposed to be tested. @add_test_categories(["libc++"]) + @skipIfRemote @skipIf(compiler=no_match("clang")) def test(self): self.build() diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c index 7ec3ded67b74f6..f6ccb031c74455 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c @@ -14,9 +14,9 @@ func_inlined (void) { static int func_inline_call_count = 0; printf ("Called func_inlined.\n"); - ++func_inline_call_count; + ++func_inline_call_count; // Set break point at this line. printf ("Returning func_inlined call count: %d.\n", func_inline_call_count); - return func_inline_call_count; // Set break point at this line. + return func_inline_call_count; } extern int func_inlined (void); diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py index f778b8e39e72ce..a4a1d9effbe191 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py @@ -32,19 +32,21 @@ def test_hw_break_set_disable_multi_thread_linux(self): self.setTearDownCleanup() self.break_multi_thread('disable', False) # llvm.org/PR44659 - # LLDB on darwin supports hardware breakpoints for arm, aarch64, x86_64 and - # i386 architectures. + # LLDB on darwin supports hardware breakpoints for x86_64 and i386 + # architectures. @skipUnlessDarwin @skipIfOutOfTreeDebugserver + @skipIfDarwinEmbedded def test_hw_break_set_delete_multi_thread_macos(self): self.build() self.setTearDownCleanup() self.break_multi_thread('delete') - # LLDB on darwin supports hardware breakpoints for arm, aarch64, x86_64 and - # i386 architectures. + # LLDB on darwin supports hardware breakpoints for x86_64 and i386 + # architectures.
@skipUnlessDarwin @skipIfOutOfTreeDebugserver + @skipIfDarwinEmbedded def test_hw_break_set_disable_multi_thread_macos(self): self.build() self.setTearDownCleanup() diff --git a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py b/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py index 74f2fbb0c1a018..61e41711310139 100644 --- a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py @@ -19,7 +19,6 @@ def supports_hw_breakpoints(self): CURRENT_EXECUTABLE_SET) self.runCmd("breakpoint set -b main --hardware") self.runCmd("run") - print(self.res.GetOutput()) if 'stopped' in self.res.GetOutput(): return 'Hardware breakpoints are supported' return None diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py index 4ef0a5957503fb..5b323f5614b217 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py @@ -76,8 +76,8 @@ def rdar11106605_commands(self): self.expect('frame variable hebrew', substrs=['לילה טוב']) def nsstring_data_formatter_commands(self): - self.expect('frame variable str0 str1 str2 str3 str4 str5 str6 str8 str9 str10 str11 label1 label2 processName str12', - substrs=[ + self.expect('frame variable empty str0 str1 str2 str3 str4 str5 str6 str8 str9 str10 str11 label1 label2 processName str12', + substrs=['(NSString *) empty = ', ' @""', # '(NSString *) str0 = ',' @"255"', '(NSString *) str1 = ', ' @"A rather short ASCII NSString object is here"', '(NSString *) str2 = ', ' @"A rather short UTF8 NSString object is here"', @@ -104,6 +104,8 @@ def nsstring_data_formatter_commands(self): self.expect('expr -d run-target -- path', substrs=['usr/blah/stuff']) self.expect('frame variable path', substrs=['usr/blah/stuff']) + self.expect('expr -d run-target -- empty_path', substrs=['@""']) + self.expect('frame variable empty_path', substrs=['@""']) def nsstring_withNULs_commands(self): """Check that the NSString formatter supports embedded NULs in the text""" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m index 576e091db1bc06..0787561e4da39e 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m @@ -17,6 +17,7 @@ int main (int argc, const char * argv[]) NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; + NSString *empty = @""; NSString *str0 = [[NSNumber numberWithUnsignedLongLong:0xFF] stringValue]; NSString *str1 = [NSString stringWithCString:"A rather short ASCII NSString object is here" encoding:NSASCIIStringEncoding]; NSString *str2 = [NSString stringWithUTF8String:"A rather short UTF8 NSString object is here"]; @@ -69,6 +70,7 @@ int main (int argc, const char * argv[]) NSArray *components = @[@"usr", @"blah", @"stuff"]; NSString *path = [NSString pathWithComponents: components]; + NSString *empty_path = [empty stringByDeletingPathExtension]; const unichar someOfTheseAreNUL[] = {'a',' ', 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m
index 576e091db1bc06..0787561e4da39e 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m
@@ -17,6 +17,7 @@ int main (int argc, const char * argv[])
     NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
 
+    NSString *empty = @"";
     NSString *str0 = [[NSNumber numberWithUnsignedLongLong:0xFF] stringValue];
     NSString *str1 = [NSString stringWithCString:"A rather short ASCII NSString object is here" encoding:NSASCIIStringEncoding];
     NSString *str2 = [NSString stringWithUTF8String:"A rather short UTF8 NSString object is here"];
@@ -69,6 +70,7 @@ int main (int argc, const char * argv[])
 
     NSArray *components = @[@"usr", @"blah", @"stuff"];
     NSString *path = [NSString pathWithComponents: components];
+    NSString *empty_path = [empty stringByDeletingPathExtension];
 
     const unichar someOfTheseAreNUL[] = {'a',' ',
         'v','e','r','y',' ', 'm','u','c','h',' ','b','o','r','i','n','g',' ','t','a','s','k',
diff --git a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
index 40e29e614ad6de..8e84566d9f691f 100644
--- a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
+++ b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py
@@ -16,9 +16,6 @@ class TestInlineStepping(TestBase):
     @expectedFailureAll(
         compiler="icc",
         bugnumber="# Not really a bug. ICC combines two inlined functions.")
-    @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343")
-    @expectedFailureAll(archs=["aarch64"], oslist=["linux"],
-                        bugnumber="llvm.org/pr44057")
     def test_with_python_api(self):
         """Test stepping over and into inlined functions."""
         self.build()
diff --git a/lldb/test/API/functionalities/inline-stepping/calling.cpp b/lldb/test/API/functionalities/inline-stepping/calling.cpp
index 9982fbf42734f0..49179ce7c97883 100644
--- a/lldb/test/API/functionalities/inline-stepping/calling.cpp
+++ b/lldb/test/API/functionalities/inline-stepping/calling.cpp
@@ -75,7 +75,7 @@ caller_trivial_1 ()
 void
 caller_trivial_2 ()
 {
-    inline_trivial_1 (); // In caller_trivial_2.
+    asm volatile ("nop"); inline_trivial_1 (); // In caller_trivial_2.
     inline_value += 1; // At increment in caller_trivial_2.
 }
 
@@ -88,7 +88,7 @@ called_by_inline_trivial ()
 void
 inline_trivial_1 ()
 {
-    inline_trivial_2(); // In inline_trivial_1.
+    asm volatile ("nop"); inline_trivial_2(); // In inline_trivial_1.
     inline_value += 1; // At increment in inline_trivial_1.
 }
 
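The asm volatile ("nop") guards above appear to give each inlined call site an address and line-table entry of its own, so stepping stops where the source comments say it should; that motivation is a reading of the change, not something the diff states. A hedged sketch of how such a step is checked through the SB API, in the style of TestInlineStepping.py:

    import lldb

    def step_into_and_get_line(thread):
        # With the nop in place, the inlined call site owns a distinct
        # line entry, so the stop line after a step is unambiguous.
        thread.StepInto()
        frame = thread.GetFrameAtIndex(0)
        return frame.GetLineEntry().GetLine()

    # Usage inside a TestBase method, e.g.:
    #   self.assertEqual(step_into_and_get_line(thread),
    #                    line_number('calling.cpp', 'In caller_trivial_2.'))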
diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile
index db8fa57abb910f..ab505a68412623 100644
--- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile
+++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile
@@ -1,3 +1,3 @@
 CXX_SOURCES := main.cpp
-CXXFLAGS_EXTRAS := -O2 -glldb -Xclang -femit-debug-entry-values
+CXXFLAGS_EXTRAS := -O2 -glldb
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py
index d0f47de83eea4c..6ef5018204fd8c 100644
--- a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py
+++ b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py
@@ -69,6 +69,8 @@ def test_api(self):
         self.assertIn("pointer being freed was not allocated",
                       stream.GetData())
 
+    # dyld leaves permanent crash_info records when testing on device.
+    @skipIfDarwinEmbedded
     def test_on_sane_process(self):
         """Test that lldb doesn't fetch the extended crash information
            dictionary from a 'sane' stopped process."""
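The rewrite below replaces TestThreadLocal's inline //% annotations with an explicit two-phase check. Condensed from the test that follows (a sketch distilled from it, not separate test code), the logic is:

    def check_tls(test, initialized):
        # Phase 1: after TLS initialization, the expressions must evaluate.
        if initialized:
            test.expect_expr("tl_local_int + 1",
                             result_type="int", result_value="323")
        # Phase 2: stopped at main's first instruction, before the TLS
        # setup has run, the same lookup must fail with the TLS error.
        else:
            test.expect("expr tl_local_int", error=True,
                        substrs=["No TLS data currently exists for this thread"])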
diff --git a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
index 5152c0010d1021..e7cfa1ca14f278 100644
--- a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
+++ b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py
@@ -1,6 +1,49 @@
-from lldbsuite.test import lldbinline
 from lldbsuite.test import decorators
 
-lldbinline.MakeInlineTest(__file__, globals(),
-                          lldbinline.expectedFailureAll(oslist=[
-                              "windows", "linux", "netbsd"]))
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+from lldbsuite.test import lldbtest
+
+
+class ThreadLocalTestCase(TestBase):
+
+    mydir = TestBase.compute_mydir(__file__)
+
+    @expectedFailureAll(oslist=["windows", "linux", "netbsd"])
+    def test_thread_local(self):
+        # Run to a breakpoint past the point where the thread-local
+        # variables have been initialized.
+        self.build()
+        exe = self.getBuildArtifact("a.out")
+
+        (target, process, _, _) = \
+            lldbutil.run_to_source_breakpoint(self, "Set breakpoint here",
+                                              lldb.SBFileSpec("main.cpp"))
+        self.expect_expr("tl_local_int + 1",
+                         result_type="int", result_value="323")
+        self.expect_expr("*tl_local_ptr + 2",
+                         result_type="int", result_value="324")
+        self.expect_expr("tl_global_int",
+                         result_type="int", result_value="123")
+        self.expect_expr("*tl_global_ptr",
+                         result_type="int", result_value="45")
+
+        # Now see if we emit the correct error when the TLS is not yet
+        # initialized. Let's set a breakpoint on the first instruction
+        # of main.
+        main_module = target.FindModule(lldb.SBFileSpec(exe))
+        main_address = main_module.FindSymbol("main").GetStartAddress()
+        main_bkpt = target.BreakpointCreateBySBAddress(main_address)
+
+        process.Kill()
+        lldbutil.run_to_breakpoint_do_run(self, target, main_bkpt)
+
+        self.expect("expr tl_local_int", error=True,
+                    substrs=["couldn't get the value of variable tl_local_int",
+                             "No TLS data currently exists for this thread"])
+        self.expect("expr *tl_local_ptr", error=True,
+                    substrs=["couldn't get the value of variable tl_local_ptr",
+                             "No TLS data currently exists for this thread"])
+
diff --git a/lldb/test/API/lang/cpp/thread_local/main.cpp b/lldb/test/API/lang/cpp/thread_local/main.cpp
index 1855b7c5f34411..04c7fc0ed74de9 100644
--- a/lldb/test/API/lang/cpp/thread_local/main.cpp
+++ b/lldb/test/API/lang/cpp/thread_local/main.cpp
@@ -3,15 +3,9 @@ thread_local int tl_global_int = 123;
 thread_local int *tl_global_ptr = &storage;
 
 int main(int argc, char **argv) {
-  //% self.expect("expr tl_local_int", error=True, substrs=["couldn't get the value of variable tl_local_int"])
-  //% self.expect("expr *tl_local_ptr", error=True, substrs=["couldn't get the value of variable tl_local_ptr"])
   thread_local int tl_local_int = 321;
   thread_local int *tl_local_ptr = nullptr;
   tl_local_ptr = &tl_local_int;
   tl_local_int++;
-  //% self.expect("expr tl_local_int + 1", substrs=["int", "= 323"])
-  //% self.expect("expr *tl_local_ptr + 2", substrs=["int", "= 324"])
-  //% self.expect("expr tl_global_int", substrs=["int", "= 123"])
-  //% self.expect("expr *tl_global_ptr", substrs=["int", "= 45"])
-  return 0;
+  return 0; // Set breakpoint here
 }
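The Makefile and test changes below move the stripping of the test binaries from test run time into the build: the Makefile now produces stripped, re-codesigned copies under stripped/, and common_setup() merely selects the prebuilt artifact. Condensed as a sketch (not the test itself):

    def pick_executable(test, strip):
        # Select the artifact the Makefile already built; the test no
        # longer shells out to strip(1)/rm(1) through subprocess.
        artifact = "stripped/a.out" if strip else "a.out"
        return test.getBuildArtifact(artifact)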
diff --git a/lldb/test/API/lang/objc/hidden-ivars/Makefile b/lldb/test/API/lang/objc/hidden-ivars/Makefile
index 0664769456eff4..283e8a118fb16a 100644
--- a/lldb/test/API/lang/objc/hidden-ivars/Makefile
+++ b/lldb/test/API/lang/objc/hidden-ivars/Makefile
@@ -4,4 +4,24 @@ OBJC_SOURCES := main.m
 
 LD_EXTRAS = -framework Foundation
 
+all: a.out libInternalDefiner.dylib stripped
+
 include Makefile.rules
+
+ifeq "$(MAKE_DSYM)" "YES"
+stripped: a.out.dSYM
+endif
+
+stripped: a.out libInternalDefiner.dylib
+	mkdir stripped
+	strip -Sx a.out -o stripped/a.out
+	strip -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib
+ifneq "$(CODESIGN)" ""
+	$(CODESIGN) -fs - stripped/a.out
+endif
+ifneq "$(CODESIGN)" ""
+	$(CODESIGN) -fs - stripped/libInternalDefiner.dylib
+endif
+ifeq "$(MAKE_DSYM)" "YES"
+	cp -r a.out.dSYM stripped/a.out.dSYM
+endif
diff --git a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
index 03a325ac49c620..5930ffdc958aae 100644
--- a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
+++ b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py
@@ -80,20 +80,11 @@ def test_frame_variable_across_modules(self):
 
     def common_setup(self, strip):
 
         if strip:
-            self.assertTrue(subprocess.call(
-                ['/usr/bin/strip', '-Sx',
-                 self.getBuildArtifact('libInternalDefiner.dylib')]) == 0,
-                'stripping dylib succeeded')
-            self.assertTrue(subprocess.call(
-                ['/bin/rm', '-rf',
-                 self.getBuildArtifact('libInternalDefiner.dylib.dSYM')]) == 0,
-                'remove dylib dSYM file succeeded')
-            self.assertTrue(subprocess.call(['/usr/bin/strip', '-Sx',
-                                             self.getBuildArtifact("a.out")
-                                             ]) == 0,
-                            'stripping a.out succeeded')
+            exe = self.getBuildArtifact("stripped/a.out")
+        else:
+            exe = self.getBuildArtifact("a.out")
 
         # Create a target by the debugger.
-        target = self.dbg.CreateTarget(self.getBuildArtifact("a.out"))
+        target = self.dbg.CreateTarget(exe)
         self.assertTrue(target, VALID_TARGET)
 
         # Create the breakpoint inside function 'main'.
@@ -110,7 +101,6 @@ def common_setup(self, strip):
             None, environment, self.get_process_working_directory())
         self.assertTrue(process, PROCESS_IS_VALID)
 
-        exe = self.getBuildArtifact("a.out")
         self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
 
         # Break inside the foo function which takes a bar_ptr argument.
diff --git a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile
index 0aaa021132e169..8b63215d6d9da6 100644
--- a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile
+++ b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile
@@ -3,7 +3,10 @@ LD_EXTRAS := -lobjc -framework Foundation
 
 all: a.out.stripped
 
+include Makefile.rules
+
 a.out.stripped: a.out.dSYM
 	strip -o a.out.stripped a.out
-
-include Makefile.rules
+ifneq "$(CODESIGN)" ""
+	$(CODESIGN) -fs - a.out.stripped
+endif
diff --git a/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py b/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py
index 22de873e29fade..added4ef508a7c 100644
--- a/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py
+++ b/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py
@@ -23,7 +23,7 @@ def setUp(self):
 
     # gcc generates incorrect linetable
     @expectedFailureAll(archs="arm", compiler="gcc", triple=".*-android")
-    @expectedFailureAll(oslist=['linux'], archs=['arm', 'aarch64'])
+    @expectedFailureAll(archs=['arm', 'aarch64'])
     @skipIfWindows
     def test_with_run_command(self):
         """Test that LLDB handles a function with __builtin_trap correctly."""
diff --git a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj
index f4267b7633a279..1c7a55f7108a0c 100644
--- a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj
+++ b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj
@@ -7,131 +7,165 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
-		23562ED61D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; };
-		23562ED71D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; };
-		26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; };
-		456F67641AD46CE9002850C2 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; };
-		26CE05C3115C36580022F371 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; };
-		456F67621AD46CE9002850C2 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; };
-		26CE05CF115C36F70022F371 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; };
-		456F676B1AD46CE9002850C2 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; };
-		26CE05B7115C363B0022F371 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; };
-		456F67551AD46CE9002850C2 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef =
26C637D60C71334A0024798E /* DNB.cpp */; }; - 264D5D581293835600ED4C01 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; - 456F67671AD46CE9002850C2 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; - 26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; - 26CE05C2115C36550022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */; }; - 456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; - 456F67611AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */; }; - 266B5ED11460A68200E43F0A /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; - 456F67691AD46CE9002850C2 /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; - 26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; - 456F675F1AD46CE9002850C2 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; - 26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; - 456F675E1AD46CE9002850C2 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; - 26CE05B8115C363C0022F371 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; - 456F67571AD46CE9002850C2 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; - 26CE05B9115C363D0022F371 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; - 456F67581AD46CE9002850C2 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; - 26CE05A7115C360D0022F371 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; - 456F67461AD46CE9002850C2 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; - 26CE05BA115C363E0022F371 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; - 456F67591AD46CE9002850C2 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; - 26CE05BB115C363F0022F371 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; - 456F675A1AD46CE9002850C2 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; - 26CE05A8115C36170022F371 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; - 456F67471AD46CE9002850C2 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* 
DNBThreadResumeActions.cpp */; }; - 23AE72E41D25DECF00945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; - 23AE72E51D25DEE100945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; - 49D404621E39260F00570CDC /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; - AFA3FCA11E39984900218D5E /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; - 456F67561AD46CE9002850C2 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; - AFEC3364194A8B0B00FF05C6 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; 23043C9D1D35DBEC00FC25CA /* JSON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA51D2DB54300E98261 /* JSON.cpp */; }; + 23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; + 2307CCCB1D4A5D630016ABC0 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 233B4EA71D2DB54300E98261 /* JSON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA51D2DB54300E98261 /* JSON.cpp */; }; + 233B4EA91D2DB96A00E98261 /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; + 23562ED21D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23562ED31D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23562ED61D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; + 23562ED71D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; + 23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; + 23562EDA1D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; + 237821B01D4917D20028B7A1 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 23AC04C61D2F41A00072351D /* LogFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C41D2F41A00072351D /* LogFilter.cpp */; }; 23AC04C71D2F41A00072351D /* LogFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C41D2F41A00072351D /* LogFilter.cpp */; }; 23AC04CA1D2F42250072351D /* LogFilterChain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C81D2F42250072351D /* LogFilterChain.cpp */; }; 23AC04CB1D2F42250072351D /* LogFilterChain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C81D2F42250072351D /* LogFilterChain.cpp */; }; - 2307CCCB1D4A5D630016ABC0 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; - 237821B01D4917D20028B7A1 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 23AC04CF1D2F58AF0072351D /* LogFilterRegex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */; }; 23AC04D01D2F58AF0072351D /* LogFilterRegex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */; }; - 23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; - 23562EDA1D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; - 23562ED21D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; - 23562ED31D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23AE72E41D25DECF00945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; + 23AE72E51D25DEE100945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; + 23D1B0291D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 23D1B02A1D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 264D5D581293835600ED4C01 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; + 266B5ED11460A68200E43F0A /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; + 26CE05A7115C360D0022F371 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; + 26CE05A8115C36170022F371 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; + 26CE05A9115C36250022F371 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; + 26CE05AA115C36260022F371 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; + 26CE05AB115C36270022F371 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; + 26CE05AC115C36280022F371 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; + 26CE05AD115C36280022F371 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; + 26CE05AE115C36320022F371 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; 26CE05B0115C36340022F371 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; - 456F674E1AD46CE9002850C2 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; 26CE05B1115C36350022F371 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F00C71334A0024798E /* MachProcess.mm */; }; - 456F674F1AD46CE9002850C2 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef 
= 26C637F00C71334A0024798E /* MachProcess.mm */; }; - 26CE05B6115C36390022F371 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; - 456F67541AD46CE9002850C2 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; 26CE05B2115C36360022F371 /* MachThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; - 456F67501AD46CE9002850C2 /* MachThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; 26CE05B3115C36370022F371 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; - 456F67511AD46CE9002850C2 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; 26CE05B4115C36380022F371 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* MachVMMemory.cpp */; }; - 456F67521AD46CE9002850C2 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* MachVMMemory.cpp */; }; 26CE05B5115C36380022F371 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; - 456F67531AD46CE9002850C2 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; - 23D1B0291D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; - 23D1B02A1D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 26CE05B6115C36390022F371 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; + 26CE05B7115C363B0022F371 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; + 26CE05B8115C363C0022F371 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; + 26CE05B9115C363D0022F371 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; + 26CE05BA115C363E0022F371 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; + 26CE05BB115C363F0022F371 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; 26CE05BC115C36420022F371 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; - 456F675B1AD46CE9002850C2 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; 26CE05BD115C36430022F371 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; - 456F675C1AD46CE9002850C2 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; + 26CE05BE115C36440022F371 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; + 26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; + 26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; + 26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; + 26CE05C3115C36580022F371 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; + 26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; + 26CE05CF115C36F70022F371 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; 26CE05F1115C387C0022F371 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; - 456F67651AD46CE9002850C2 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; - 26CE05AA115C36260022F371 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; + 456F67461AD46CE9002850C2 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; + 456F67471AD46CE9002850C2 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; + 456F67481AD46CE9002850C2 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; 456F67491AD46CE9002850C2 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; - 26CE05AD115C36280022F371 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; - 456F674C1AD46CE9002850C2 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; - 26CE05AB115C36270022F371 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; 456F674A1AD46CE9002850C2 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; - 26CE05AC115C36280022F371 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; 456F674B1AD46CE9002850C2 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; - AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; - D6631CA91E848FE9006A7B11 /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; + 456F674C1AD46CE9002850C2 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; + 456F674D1AD46CE9002850C2 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; + 456F674E1AD46CE9002850C2 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; + 456F674F1AD46CE9002850C2 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F00C71334A0024798E /* MachProcess.mm */; }; + 456F67501AD46CE9002850C2 /* MachThread.cpp in Sources */ 
= {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; + 456F67511AD46CE9002850C2 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; + 456F67521AD46CE9002850C2 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* MachVMMemory.cpp */; }; + 456F67531AD46CE9002850C2 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; + 456F67541AD46CE9002850C2 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; + 456F67551AD46CE9002850C2 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; + 456F67561AD46CE9002850C2 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; + 456F67571AD46CE9002850C2 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; + 456F67581AD46CE9002850C2 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; + 456F67591AD46CE9002850C2 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; + 456F675A1AD46CE9002850C2 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; + 456F675B1AD46CE9002850C2 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; + 456F675C1AD46CE9002850C2 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; + 456F675D1AD46CE9002850C2 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; + 456F675E1AD46CE9002850C2 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; + 456F675F1AD46CE9002850C2 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; + 456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; + 456F67621AD46CE9002850C2 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; + 456F67641AD46CE9002850C2 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; + 456F67651AD46CE9002850C2 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; + 456F67671AD46CE9002850C2 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; + 456F67691AD46CE9002850C2 /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; + 456F676B1AD46CE9002850C2 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; + 49D404621E39260F00570CDC /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; AF48558C1D75126800D19C07 /* StdStringExtractor.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */; }; AF48558D1D75127500D19C07 /* StdStringExtractor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */; }; - 23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; - 233B4EA91D2DB96A00E98261 /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; - 26CE05BE115C36440022F371 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; - 456F675D1AD46CE9002850C2 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; - 26CE05AE115C36320022F371 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; - 456F674D1AD46CE9002850C2 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; - 26CE05A9115C36250022F371 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; - 456F67481AD46CE9002850C2 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; + AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; + AFA3FCA11E39984900218D5E /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; + AFEC3364194A8B0B00FF05C6 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; + D6631CA91E848FE9006A7B11 /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ActivityStore.cpp; sourceTree = ""; }; + 2307CCCC1D4A5DAE0016ABC0 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; + 233B4EA51D2DB54300E98261 /* JSON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSON.cpp; sourceTree = ""; }; + 233B4EA61D2DB54300E98261 /* JSON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSON.h; sourceTree = ""; }; + 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = StringConvert.cpp; path = ../../../source/Host/common/StringConvert.cpp; sourceTree = ""; }; + 23562ECF1D34110D00AB2BD4 /* DarwinLogTypes.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogTypes.h; sourceTree = ""; }; + 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessageOsLog.cpp; sourceTree = ""; }; + 23562ED11D3424DF00AB2BD4 /* LogMessageOsLog.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogMessageOsLog.h; 
sourceTree = ""; }; 23562ED41D3426DD00AB2BD4 /* ActivityStore.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ActivityStore.h; sourceTree = ""; }; + 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ActivityStore.cpp; sourceTree = ""; }; + 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessage.cpp; sourceTree = ""; }; + 237821AD1D4917D20028B7A1 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; + 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterExactMatch.cpp; sourceTree = ""; }; + 237821AF1D4917D20028B7A1 /* LogFilterExactMatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterExactMatch.h; sourceTree = ""; }; + 23AC04C41D2F41A00072351D /* LogFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilter.cpp; sourceTree = ""; }; + 23AC04C51D2F41A00072351D /* LogFilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilter.h; sourceTree = ""; }; + 23AC04C81D2F42250072351D /* LogFilterChain.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterChain.cpp; sourceTree = ""; }; + 23AC04C91D2F42250072351D /* LogFilterChain.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterChain.h; sourceTree = ""; }; + 23AC04CC1D2F42F10072351D /* DarwinLogInterfaces.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogInterfaces.h; sourceTree = ""; }; + 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterRegex.cpp; sourceTree = ""; }; + 23AC04CE1D2F58AF0072351D /* LogFilterRegex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterRegex.h; sourceTree = ""; }; + 23AC04D11D2F60130072351D /* LogMessage.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LogMessage.h; sourceTree = ""; }; + 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DarwinLogCollector.cpp; sourceTree = ""; }; + 23AE72E31D25DECF00945BCE /* DarwinLogCollector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DarwinLogCollector.h; sourceTree = ""; }; 23AE72E61D25DEFB00945BCE /* ActivityStreamSPI.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ActivityStreamSPI.h; sourceTree = ""; }; + 23CF6F5E1D28A3760088ADC9 /* DarwinLogEvent.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogEvent.h; sourceTree = ""; }; + 23D1B0271D497E8B00FF831B /* OsLogger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OsLogger.cpp; sourceTree = ""; }; + 23D1B0281D497E8B00FF831B /* OsLogger.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = OsLogger.h; sourceTree = ""; }; + 260828DE0CBAF7F400F95054 /* DNBRuntimeAction.h */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBRuntimeAction.h; sourceTree = ""; }; + 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBThreadResumeActions.cpp; sourceTree = ""; }; + 260E7332114BFFE600D1DFB3 /* DNBThreadResumeActions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBThreadResumeActions.h; sourceTree = ""; }; + 260FC7320E5B290400043FC9 /* debugnub-exports */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "debugnub-exports"; sourceTree = SOURCE_ROOT; }; + 26203D1C1641EFB200A662F7 /* com.apple.debugserver.applist.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.internal.plist; sourceTree = ""; }; + 26203D1D1641EFB200A662F7 /* com.apple.debugserver.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.internal.plist; sourceTree = ""; }; + 26242C390DDBD33C0054A4CC /* debugserver-entitlements.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "debugserver-entitlements.plist"; sourceTree = ""; }; + 264D5D571293835600ED4C01 /* DNBArch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArch.cpp; sourceTree = ""; }; + 264F679A1B2F9EB200140093 /* JSONGenerator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = JSONGenerator.h; sourceTree = ""; }; + 26593A060D4931CC001C9FE3 /* ChangeLog */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = ChangeLog; sourceTree = ""; }; + 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplARM64.cpp; sourceTree = ""; }; + 266B5ED01460A68200E43F0A /* DNBArchImplARM64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplARM64.h; sourceTree = ""; }; + 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadMutex.cpp; sourceTree = ""; }; + 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = DNBArchImpl.cpp; path = arm/DNBArchImpl.cpp; sourceTree = ""; }; + 2675D4230CCEB705000F49AF /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; name = DNBArchImpl.h; path = arm/DNBArchImpl.h; sourceTree = ""; }; 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFBundle.cpp; sourceTree = ""; }; 2695DD920D3EBFF6007E4CA2 /* CFBundle.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CFBundle.h; sourceTree = ""; }; - 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFString.cpp; sourceTree = ""; }; 2695DD9A0D3EC160007E4CA2 /* CFString.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CFString.h; sourceTree = ""; }; - 26C637E70C71334A0024798E /* CFUtils.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = 
sourcecode.c.h; path = CFUtils.h; sourceTree = ""; }; - 2307CCCC1D4A5DAE0016ABC0 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; - 237821AD1D4917D20028B7A1 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; - 26593A060D4931CC001C9FE3 /* ChangeLog */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = ChangeLog; sourceTree = ""; }; + 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFString.cpp; sourceTree = ""; }; + 269E8DF8164B2ED200AD65F6 /* com.apple.debugserver.posix.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.posix.plist; sourceTree = ""; }; + 26A02918114AB9240029C479 /* debugserver.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = debugserver.cpp; sourceTree = ""; }; + 26A4BAED0D498B7D00A9BEAB /* com.apple.debugserver.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.plist; sourceTree = ""; }; + 26A68F7D0D104EC800665A9E /* RNBContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBContext.h; sourceTree = ""; }; + 26A68F7E0D104EC800665A9E /* RNBContext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBContext.cpp; sourceTree = ""; }; + 26A68FAF0D1054DA00665A9E /* RNBSocket.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBSocket.h; sourceTree = ""; }; + 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBSocket.cpp; sourceTree = ""; }; + 26A68FD50D10574500665A9E /* RNBRemote.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBRemote.h; sourceTree = ""; }; + 26A68FD60D10574500665A9E /* RNBRemote.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBRemote.cpp; sourceTree = ""; }; + 26A8FE1E0D11A77B00203048 /* DNBTimer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBTimer.h; sourceTree = ""; }; 26ACA3340D3E956300A2120B /* CoreFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreFoundation.framework; path = System/Library/Frameworks/CoreFoundation.framework; sourceTree = SDKROOT; }; + 26B67DE00EE9BC30006C8BC0 /* MachTask.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachTask.h; sourceTree = ""; }; + 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MachTask.mm; sourceTree = ""; }; 26C637D60C71334A0024798E /* DNB.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNB.cpp; sourceTree = ""; }; 26C637D70C71334A0024798E /* DNB.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNB.h; sourceTree = ""; }; - 264D5D571293835600ED4C01 /* DNBArch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArch.cpp; sourceTree = ""; }; 
26C637D80C71334A0024798E /* DNBArch.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArch.h; sourceTree = ""; }; - 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = DNBArchImpl.cpp; path = arm/DNBArchImpl.cpp; sourceTree = ""; }; - 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImpl.cpp; sourceTree = ""; }; - 2675D4230CCEB705000F49AF /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; name = DNBArchImpl.h; path = arm/DNBArchImpl.h; sourceTree = ""; }; - 26C637FC0C71334A0024798E /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImpl.h; sourceTree = ""; }; - 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplARM64.cpp; sourceTree = ""; }; - 266B5ED01460A68200E43F0A /* DNBArchImplARM64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplARM64.h; sourceTree = ""; }; - 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplI386.cpp; sourceTree = ""; }; - 26C637EB0C71334A0024798E /* DNBArchImplI386.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImplI386.h; sourceTree = ""; }; - 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplX86_64.cpp; sourceTree = ""; }; - 26CF99A31142EB7400011AAB /* DNBArchImplX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplX86_64.h; sourceTree = ""; }; 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBBreakpoint.cpp; sourceTree = ""; }; 26C637DA0C71334A0024798E /* DNBBreakpoint.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBBreakpoint.h; sourceTree = ""; }; 26C637DB0C71334A0024798E /* DNBDataRef.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBDataRef.cpp; sourceTree = ""; }; @@ -143,42 +177,14 @@ 26C637E10C71334A0024798E /* DNBLog.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBLog.h; sourceTree = ""; }; 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBRegisterInfo.cpp; sourceTree = ""; }; 26C637E30C71334A0024798E /* DNBRegisterInfo.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBRegisterInfo.h; sourceTree = ""; }; - 260828DE0CBAF7F400F95054 /* DNBRuntimeAction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBRuntimeAction.h; sourceTree = ""; }; - 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBThreadResumeActions.cpp; sourceTree = ""; }; - 260E7332114BFFE600D1DFB3 /* DNBThreadResumeActions.h */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBThreadResumeActions.h; sourceTree = "<group>"; };
- 26A8FE1E0D11A77B00203048 /* DNBTimer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBTimer.h; sourceTree = "<group>"; };
- 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DarwinLogCollector.cpp; sourceTree = "<group>"; };
- 23AE72E31D25DECF00945BCE /* DarwinLogCollector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DarwinLogCollector.h; sourceTree = "<group>"; };
- 23CF6F5E1D28A3760088ADC9 /* DarwinLogEvent.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogEvent.h; sourceTree = "<group>"; };
- 23AC04CC1D2F42F10072351D /* DarwinLogInterfaces.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogInterfaces.h; sourceTree = "<group>"; };
- 23562ECF1D34110D00AB2BD4 /* DarwinLogTypes.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogTypes.h; sourceTree = "<group>"; };
- 49D404611E39260F00570CDC /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
- AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Genealogy.cpp; sourceTree = "<group>"; };
- AF0934BA18E12B92005A11FD /* Genealogy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Genealogy.h; sourceTree = "<group>"; };
- AF0934BB18E12B92005A11FD /* GenealogySPI.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GenealogySPI.h; sourceTree = "<group>"; };
- 233B4EA51D2DB54300E98261 /* JSON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSON.cpp; sourceTree = "<group>"; };
- 233B4EA61D2DB54300E98261 /* JSON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSON.h; sourceTree = "<group>"; };
- 264F679A1B2F9EB200140093 /* JSONGenerator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = JSONGenerator.h; sourceTree = "<group>"; };
- 23AC04C41D2F41A00072351D /* LogFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilter.cpp; sourceTree = "<group>"; };
- 23AC04C51D2F41A00072351D /* LogFilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilter.h; sourceTree = "<group>"; };
- 23AC04C81D2F42250072351D /* LogFilterChain.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterChain.cpp; sourceTree = "<group>"; };
- 23AC04C91D2F42250072351D /* LogFilterChain.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterChain.h; sourceTree = "<group>"; };
- 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterExactMatch.cpp; sourceTree = "<group>"; };
- 237821AF1D4917D20028B7A1 /* LogFilterExactMatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterExactMatch.h; sourceTree = "<group>"; };
- 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterRegex.cpp; sourceTree = "<group>"; };
- 23AC04CE1D2F58AF0072351D /* LogFilterRegex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterRegex.h; sourceTree = "<group>"; };
- 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessage.cpp; sourceTree = "<group>"; };
- 23AC04D11D2F60130072351D /* LogMessage.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LogMessage.h; sourceTree = "<group>"; };
- 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessageOsLog.cpp; sourceTree = "<group>"; };
- 23562ED11D3424DF00AB2BD4 /* LogMessageOsLog.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogMessageOsLog.h; sourceTree = "<group>"; };
+ 26C637E70C71334A0024798E /* CFUtils.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = CFUtils.h; sourceTree = "<group>"; };
+ 26C637E80C71334A0024798E /* dbgnub-mig.defs */ = {isa = PBXFileReference; explicitFileType = sourcecode.mig; fileEncoding = 30; path = "dbgnub-mig.defs"; sourceTree = "<group>"; };
+ 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplI386.cpp; sourceTree = "<group>"; };
+ 26C637EB0C71334A0024798E /* DNBArchImplI386.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImplI386.h; sourceTree = "<group>"; };
26C637EE0C71334A0024798E /* MachException.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachException.cpp; sourceTree = "<group>"; };
26C637EF0C71334A0024798E /* MachException.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachException.h; sourceTree = "<group>"; };
- 26C637F10C71334A0024798E /* MachProcess.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachProcess.h; sourceTree = "<group>"; };
26C637F00C71334A0024798E /* MachProcess.mm */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.objcpp; path = MachProcess.mm; sourceTree = "<group>"; };
- 49F530111331519C008956F6 /* MachRegisterStatesI386.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesI386.h; sourceTree = "<group>"; };
- 49F5301213316D7F008956F6 /* MachRegisterStatesX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesX86_64.h; sourceTree = "<group>"; };
- 26B67DE00EE9BC30006C8BC0 /* MachTask.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachTask.h; sourceTree = "<group>"; };
- 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MachTask.mm; sourceTree = "<group>"; };
+ 26C637F10C71334A0024798E /* MachProcess.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachProcess.h; sourceTree = "<group>"; };
26C637F20C71334A0024798E /* MachThread.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachThread.cpp; sourceTree = "<group>"; };
26C637F30C71334A0024798E /* MachThread.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachThread.h; sourceTree = "<group>"; };
26C637F40C71334A0024798E /* MachThreadList.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachThreadList.cpp; sourceTree = "<group>"; };
@@ -187,45 +193,35 @@
26C637F70C71334A0024798E /* MachVMMemory.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachVMMemory.h; sourceTree = "<group>"; };
26C637F80C71334A0024798E /* MachVMRegion.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachVMRegion.cpp; sourceTree = "<group>"; };
26C637F90C71334A0024798E /* MachVMRegion.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachVMRegion.h; sourceTree = "<group>"; };
- 23D1B0271D497E8B00FF831B /* OsLogger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OsLogger.cpp; sourceTree = "<group>"; };
- 23D1B0281D497E8B00FF831B /* OsLogger.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = OsLogger.h; sourceTree = "<group>"; };
26C637FD0C71334A0024798E /* PThreadCondition.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadCondition.h; sourceTree = "<group>"; };
26C637FE0C71334A0024798E /* PThreadEvent.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadEvent.cpp; sourceTree = "<group>"; };
26C637FF0C71334A0024798E /* PThreadEvent.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadEvent.h; sourceTree = "<group>"; };
- 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadMutex.cpp; sourceTree = "<group>"; };
26C638000C71334A0024798E /* PThreadMutex.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadMutex.h; sourceTree = "<group>"; };
- AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PseudoTerminal.cpp; sourceTree = "<group>"; };
- AF67AC000D34604D0022D128 /* PseudoTerminal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PseudoTerminal.h; sourceTree = "<group>"; };
- 26A68F7E0D104EC800665A9E /* RNBContext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBContext.cpp; sourceTree = "<group>"; };
- 26A68F7D0D104EC800665A9E /* RNBContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBContext.h; sourceTree = "<group>"; };
- 26E6B9DA0D1329010037ECDD /* RNBDefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBDefs.h; sourceTree = "<group>"; };
- 26A68FD60D10574500665A9E /* RNBRemote.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBRemote.cpp; sourceTree = "<group>"; };
- 26A68FD50D10574500665A9E /* RNBRemote.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBRemote.h; sourceTree = "<group>"; };
- EF8878A00D9C797C001831DA /* RNBServices.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBServices.cpp; sourceTree = "<group>"; };
- EF88789F0D9C797C001831DA /* RNBServices.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBServices.h; sourceTree = "<group>"; };
- 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBSocket.cpp; sourceTree = "<group>"; };
- 26A68FAF0D1054DA00665A9E /* RNBSocket.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBSocket.h; sourceTree = "<group>"; };
- D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = SocketAddress.cpp; path = ../../source/Host/common/SocketAddress.cpp; sourceTree = "<group>"; };
- AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StdStringExtractor.cpp; sourceTree = "<group>"; };
- 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = StringConvert.cpp; path = ../../../source/Host/common/StringConvert.cpp; sourceTree = "<group>"; };
26C638010C71334A0024798E /* SysSignal.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = SysSignal.cpp; sourceTree = "<group>"; };
26C638020C71334A0024798E /* SysSignal.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = SysSignal.h; sourceTree = "<group>"; };
26C638050C71334A0024798E /* TTYState.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = TTYState.cpp; sourceTree = "<group>"; };
26C638060C71334A0024798E /* TTYState.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = TTYState.h; sourceTree = "<group>"; };
- 26203D1C1641EFB200A662F7 /* com.apple.debugserver.applist.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.internal.plist; sourceTree = "<group>"; };
- EF88788B0D9C7558001831DA /* com.apple.debugserver.applist.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.plist; sourceTree = "<group>"; };
- 26203D1D1641EFB200A662F7 /* com.apple.debugserver.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.internal.plist; sourceTree = "<group>"; };
- 26A4BAED0D498B7D00A9BEAB /* com.apple.debugserver.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.plist; sourceTree = "<group>"; };
- 269E8DF8164B2ED200AD65F6 /* com.apple.debugserver.posix.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.posix.plist; sourceTree = "<group>"; };
- AF949ED620605DC2002A91F9 /* com.apple.internal.xpc.remote.debugserver.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = com.apple.internal.xpc.remote.debugserver.plist; sourceTree = "<group>"; };
- 26C637E80C71334A0024798E /* dbgnub-mig.defs */ = {isa = PBXFileReference; explicitFileType = sourcecode.mig; fileEncoding = 30; path = "dbgnub-mig.defs"; sourceTree = "<group>"; };
- 260FC7320E5B290400043FC9 /* debugnub-exports */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "debugnub-exports"; sourceTree = SOURCE_ROOT; };
26CE0594115C31C20022F371 /* debugserver */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = debugserver; sourceTree = BUILT_PRODUCTS_DIR; };
- 26242C390DDBD33C0054A4CC /* debugserver-entitlements.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "debugserver-entitlements.plist"; sourceTree = "<group>"; };
- AF61C60418F75ABC00B48D9D /* debugserver-macosx-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "debugserver-macosx-entitlements.plist"; sourceTree = "<group>"; };
+ 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplX86_64.cpp; sourceTree = "<group>"; };
+ 26CF99A31142EB7400011AAB /* DNBArchImplX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplX86_64.h; sourceTree = "<group>"; };
+ 26E6B9DA0D1329010037ECDD /* RNBDefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBDefs.h; sourceTree = "<group>"; };
456F67721AD46CE9002850C2 /* debugserver-nonui */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "debugserver-nonui"; sourceTree = BUILT_PRODUCTS_DIR; };
- 26A02918114AB9240029C479 /* debugserver.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = debugserver.cpp; sourceTree = "<group>"; };
+ 49D404611E39260F00570CDC /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
+ 49F530111331519C008956F6 /* MachRegisterStatesI386.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesI386.h; sourceTree = "<group>"; };
+ 49F5301213316D7F008956F6 /* MachRegisterStatesX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesX86_64.h; sourceTree = "<group>"; };
9457ECF61419864100DFE7D8 /* stack_logging.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = stack_logging.h; sourceTree = "<group>"; };
+ AF0934BA18E12B92005A11FD /* Genealogy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Genealogy.h; sourceTree = "<group>"; };
+ AF0934BB18E12B92005A11FD /* GenealogySPI.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GenealogySPI.h; sourceTree = "<group>"; };
+ AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StdStringExtractor.cpp; sourceTree = "<group>"; };
+ AF61C60418F75ABC00B48D9D /* debugserver-macosx-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "debugserver-macosx-entitlements.plist"; sourceTree = "<group>"; };
+ AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PseudoTerminal.cpp; sourceTree = "<group>"; };
+ AF67AC000D34604D0022D128 /* PseudoTerminal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PseudoTerminal.h; sourceTree = "<group>"; };
+ AF949ED620605DC2002A91F9 /* com.apple.internal.xpc.remote.debugserver.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = com.apple.internal.xpc.remote.debugserver.plist; sourceTree = "<group>"; };
+ AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Genealogy.cpp; sourceTree = "<group>"; };
+ D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = SocketAddress.cpp; path = ../../source/Host/common/SocketAddress.cpp; sourceTree = "<group>"; };
+ EF88788B0D9C7558001831DA /* com.apple.debugserver.applist.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.plist; sourceTree = "<group>"; };
+ EF88789F0D9C797C001831DA /* RNBServices.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBServices.h; sourceTree = "<group>"; };
+ EF8878A00D9C797C001831DA /* RNBServices.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBServices.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -409,7 +405,6 @@
2675D41C0CCEB6CF000F49AF /* arm */,
266B5ECE1460A68200E43F0A /* arm64 */,
26C637E90C71334A0024798E /* i386 */,
- 26C637FA0C71334A0024798E /* ppc */,
26CF99A11142EB7400011AAB /* x86_64 */,
26C637E80C71334A0024798E /* dbgnub-mig.defs */,
AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */,
@@ -446,15 +441,6 @@
path = i386;
sourceTree = "<group>";
};
- 26C637FA0C71334A0024798E /* ppc */ = {
- isa = PBXGroup;
- children = (
- 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */,
- 26C637FC0C71334A0024798E /* DNBArchImpl.h */,
- );
- path = ppc;
- sourceTree = "<group>";
- };
26CF99A11142EB7400011AAB /* x86_64 */ = {
isa = PBXGroup;
children = (
@@ -617,7 +603,6 @@
26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */,
26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */,
26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */,
- 26CE05C2115C36550022F371 /* DNBArchImpl.cpp in Sources */,
26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */,
26CE05C3115C36580022F371 /* CFString.cpp in Sources */,
23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */,
@@ -668,7 +653,6 @@
456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */,
23AC04C71D2F41A00072351D /* LogFilter.cpp in Sources */,
23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */,
- 456F67611AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */,
AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */,
456F67621AD46CE9002850C2 /* CFString.cpp in Sources */,
23AC04CB1D2F42250072351D /* LogFilterChain.cpp in Sources */,
@@ -863,6 +847,8 @@
"$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist",
"$(LLDB_ENERGY_LDFLAGS)",
"$(LLDB_COMPRESSION_LDFLAGS)",
+ "-framework",
+ Security,
);
OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)";
PRODUCT_NAME = debugserver;
@@ -942,6 +928,8 @@
"$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist",
"$(LLDB_ENERGY_LDFLAGS)",
"$(LLDB_COMPRESSION_LDFLAGS)",
+ "-framework",
+ Security,
);
OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)";
PRODUCT_NAME = debugserver;
@@ -1020,6 +1008,8 @@
"$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist",
"$(LLDB_ENERGY_LDFLAGS)",
"$(LLDB_COMPRESSION_LDFLAGS)",
+ "-framework",
+ Security,
);
OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)";
PRODUCT_NAME = debugserver;
@@ -1147,9 +1137,7 @@
LLDB_ENERGY_CFLAGS = "";
"LLDB_ENERGY_CFLAGS[sdk=*.internal]" = "-DLLDB_ENERGY";
LLDB_ENERGY_LDFLAGS = "-lpmenergy -lpmsample";
- OTHER_CFLAGS = (
- "$(LLDB_ENERGY_CFLAGS)",
- );
+ OTHER_CFLAGS = "$(LLDB_ENERGY_CFLAGS)";
"OTHER_CFLAGS[sdk=iphoneos*][arch=*]" = (
"-Wparentheses",
"-DOS_OBJECT_USE_OBJC=0",
diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
index f99dbc48b128ec..e5d4b05d987c1e
100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -666,6 +666,112 @@ uint32_t DNBArchMachARM64::NumSupportedHardwareWatchpoints() { return g_num_supported_hw_watchpoints; } +uint32_t DNBArchMachARM64::NumSupportedHardwareBreakpoints() { + // Set the init value to something that will let us know that we need to + // autodetect how many breakpoints are supported dynamically... + static uint32_t g_num_supported_hw_breakpoints = UINT_MAX; + if (g_num_supported_hw_breakpoints == UINT_MAX) { + // Set this to zero in case we can't tell if there are any HW breakpoints + g_num_supported_hw_breakpoints = 0; + + size_t len; + uint32_t n = 0; + len = sizeof(n); + if (::sysctlbyname("hw.optional.breakpoint", &n, &len, NULL, 0) == 0) { + g_num_supported_hw_breakpoints = n; + DNBLogThreadedIf(LOG_THREAD, "hw.optional.breakpoint=%u", n); + } else { +// For AArch64 we would need to look at ID_AA64DFR0_EL1 but debugserver runs in +// EL0 so it can't access that reg. The kernel should have filled in the +// sysctls based on it though. +#if defined(__arm__) + uint32_t register_DBGDIDR; + + asm("mrc p14, 0, %0, c0, c0, 0" : "=r"(register_DBGDIDR)); + uint32_t numWRPs = bits(register_DBGDIDR, 31, 28); + // Zero is reserved for the WRP count, so don't increment it if it is zero + if (numWRPs > 0) + numWRPs++; + g_num_supported_hw_breakpoints = numWRPs; + DNBLogThreadedIf(LOG_THREAD, + "Number of supported hw breakpoint via asm(): %d", + g_num_supported_hw_breakpoints); +#endif + } + } + return g_num_supported_hw_breakpoints; +} + +uint32_t DNBArchMachARM64::EnableHardwareBreakpoint(nub_addr_t addr, + nub_size_t size, + bool also_set_on_task) { + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::EnableHardwareBreakpoint(addr = " + "0x%8.8llx, size = %zu)", + (uint64_t)addr, size); + + const uint32_t num_hw_breakpoints = NumSupportedHardwareBreakpoints(); + + nub_addr_t aligned_bp_address = addr; + uint32_t control_value = 0; + + switch (size) { + case 2: + control_value = (0x3 << 5) | 7; + aligned_bp_address &= ~1; + break; + case 4: + control_value = (0xfu << 5) | 7; + aligned_bp_address &= ~3; + break; + }; + + // Read the debug state + kern_return_t kret = GetDBGState(false); + if (kret == KERN_SUCCESS) { + // Check to make sure we have the needed hardware support + uint32_t i = 0; + + for (i = 0; i < num_hw_breakpoints; ++i) { + if ((m_state.dbg.__bcr[i] & BCR_ENABLE) == 0) + break; // We found an available hw breakpoint slot (in i) + } + + // See if we found an available hw breakpoint slot above + if (i < num_hw_breakpoints) { + m_state.dbg.__bvr[i] = aligned_bp_address; + m_state.dbg.__bcr[i] = control_value; + + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::EnableHardwareBreakpoint() " + "adding breakpoint on address 0x%llx with control " + "register value 0x%x", + (uint64_t)m_state.dbg.__bvr[i], + (uint32_t)m_state.dbg.__bcr[i]); + + // The kernel will set the MDE_ENABLE bit in the MDSCR_EL1 for us + // automatically, don't need to do it here. 
+ kret = SetDBGState(also_set_on_task); + + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "EnableHardwareBreakpoint() " + "SetDBGState() => 0x%8.8x.", + kret); + + if (kret == KERN_SUCCESS) + return i; + } else { + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "EnableHardwareBreakpoint(): All " + "hardware resources (%u) are in use.", + num_hw_breakpoints); + } + } + return INVALID_NUB_HW_INDEX; +} + uint32_t DNBArchMachARM64::EnableHardwareWatchpoint(nub_addr_t addr, nub_size_t size, bool read, bool write, @@ -905,6 +1011,32 @@ bool DNBArchMachARM64::DisableHardwareWatchpoint_helper(uint32_t hw_index, return (kret == KERN_SUCCESS); }
+bool DNBArchMachARM64::DisableHardwareBreakpoint(uint32_t hw_index, + bool also_set_on_task) { + kern_return_t kret = GetDBGState(false); + if (kret != KERN_SUCCESS) + return false; + + const uint32_t num_hw_points = NumSupportedHardwareBreakpoints(); + if (hw_index >= num_hw_points) + return false; + + m_disabled_breakpoints[hw_index].addr = m_state.dbg.__bvr[hw_index]; + m_disabled_breakpoints[hw_index].control = m_state.dbg.__bcr[hw_index]; + + m_state.dbg.__bcr[hw_index] = 0; + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "DisableHardwareBreakpoint( %u ) - BVR%u = " + "0x%8.8llx BCR%u = 0x%8.8llx", + hw_index, hw_index, (uint64_t)m_state.dbg.__bvr[hw_index], + hw_index, (uint64_t)m_state.dbg.__bcr[hw_index]); + + kret = SetDBGState(also_set_on_task); + + return (kret == KERN_SUCCESS); +} + // This is for checking the Byte Address Select bits in the DBRWCRn_EL1 control // register. // Returns -1 if the trailing bit patterns are not one of: diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h index ea4efa48d02608..fafcb73837b723 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h @@ -26,10 +26,12 @@ class DNBArchMachARM64 : public DNBArchProtocol { DNBArchMachARM64(MachThread *thread) : m_thread(thread), m_state(), m_disabled_watchpoints(), - m_watchpoint_hw_index(-1), m_watchpoint_did_occur(false), + m_disabled_breakpoints(), m_watchpoint_hw_index(-1), + m_watchpoint_did_occur(false), m_watchpoint_resume_single_step_enabled(false), m_saved_register_states() { m_disabled_watchpoints.resize(16); + m_disabled_breakpoints.resize(16); memset(&m_dbg_save, 0, sizeof(m_dbg_save)); } @@ -62,7 +64,13 @@ class DNBArchMachARM64 : public DNBArchProtocol { static const uint8_t *SoftwareBreakpointOpcode(nub_size_t byte_size); static uint32_t GetCPUType(); + virtual uint32_t NumSupportedHardwareBreakpoints(); virtual uint32_t NumSupportedHardwareWatchpoints(); + + virtual uint32_t EnableHardwareBreakpoint(nub_addr_t addr, nub_size_t size, + bool also_set_on_task); + virtual bool DisableHardwareBreakpoint(uint32_t hw_break_index, + bool also_set_on_task); virtual uint32_t EnableHardwareWatchpoint(nub_addr_t addr, nub_size_t size, bool read, bool write, bool also_set_on_task); @@ -229,10 +237,11 @@ class DNBArchMachARM64 : public DNBArchProtocol { State m_state; arm_debug_state64_t m_dbg_save; - // arm64 doesn't keep the disabled watchpoint values in the debug register - // context like armv7; + // arm64 doesn't keep the disabled watchpoint and breakpoint values in the + // debug register context like armv7; // we need to save them aside when we disable them temporarily.
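The save-aside bookkeeping described in this comment reduces to a small side table indexed by hardware slot, mirrored by the two vectors declared just below. A minimal self-contained C++ sketch of the pattern (``DebugRegs``, ``SavedPoint``, and the 16-slot size are illustrative stand-ins, not the debugserver types)::

   #include <array>
   #include <cstdint>

   // Illustrative stand-in for the kernel-provided debug state; the real
   // code operates on arm_debug_state64_t's __bvr/__bcr arrays.
   struct DebugRegs {
     std::array<uint64_t, 16> bvr{}; // breakpoint value registers (addresses)
     std::array<uint64_t, 16> bcr{}; // breakpoint control registers
   };

   constexpr uint64_t kEnableBit = 1; // plays the role of BCR_ENABLE above

   struct SavedPoint {
     uint64_t addr = 0;
     uint64_t control = 0;
   };

   // Temporarily disable slot i: remember its value/control pair, then clear
   // the control register so the CPU no longer honors the breakpoint.
   inline void disableSlot(DebugRegs &dbg, std::array<SavedPoint, 16> &saved,
                           unsigned i) {
     saved[i] = {dbg.bvr[i], dbg.bcr[i]};
     dbg.bcr[i] = 0;
   }

   // Restore the slot from the side table, e.g. after stepping past it.
   inline void reenableSlot(DebugRegs &dbg,
                            const std::array<SavedPoint, 16> &saved,
                            unsigned i) {
     dbg.bvr[i] = saved[i].addr;
     dbg.bcr[i] = saved[i].control | kEnableBit;
   }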
std::vector<disabled_watchpoint> m_disabled_watchpoints; + std::vector<disabled_watchpoint> m_disabled_breakpoints; // The following member variables should be updated atomically. int32_t m_watchpoint_hw_index; diff --git a/lldb/unittests/Platform/PlatformDarwinTest.cpp b/lldb/unittests/Platform/PlatformDarwinTest.cpp index 06287c63227b60..20916f3cd12595 100644 --- a/lldb/unittests/Platform/PlatformDarwinTest.cpp +++ b/lldb/unittests/Platform/PlatformDarwinTest.cpp @@ -19,6 +19,7 @@ using namespace lldb_private; struct PlatformDarwinTester : public PlatformDarwin { public: + using PlatformDarwin::FindComponentInPath; using PlatformDarwin::FindXcodeContentsDirectoryInPath; static bool SDKSupportsModules(SDKType desired_type, const lldb_private::FileSpec &sdk_path) { @@ -132,3 +133,20 @@ TEST(PlatformDarwinTest, GetSDKNameForType) { EXPECT_EQ( "", PlatformDarwin::GetSDKNameForType(PlatformDarwin::SDKType::unknown)); } + +TEST(PlatformDarwinTest, FindComponentInPath) { + EXPECT_EQ("/path/to/foo", + PlatformDarwinTester::FindComponentInPath("/path/to/foo/", "foo")); + + EXPECT_EQ("/path/to/foo", + PlatformDarwinTester::FindComponentInPath("/path/to/foo", "foo")); + + EXPECT_EQ("/path/to/foobar", PlatformDarwinTester::FindComponentInPath( + "/path/to/foobar", "foo")); + + EXPECT_EQ("/path/to/foobar", PlatformDarwinTester::FindComponentInPath( + "/path/to/foobar", "bar")); + + EXPECT_EQ("", + PlatformDarwinTester::FindComponentInPath("/path/to/foo", "bar")); +} diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp index c66edc44479784..690c5ae331ee25 100644 --- a/lldb/unittests/Utility/FileSpecTest.cpp +++ b/lldb/unittests/Utility/FileSpecTest.cpp @@ -441,3 +441,9 @@ TEST(FileSpecTest, Yaml) { EXPECT_EQ(deserialized.GetDirectory(), fs_windows.GetDirectory()); EXPECT_EQ(deserialized, fs_windows); } + +TEST(FileSpecTest, OperatorBool) { + EXPECT_FALSE(FileSpec()); + EXPECT_FALSE(FileSpec("")); + EXPECT_TRUE(FileSpec("/foo/bar")); +} diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 3bb037b803dea8..a70178ee8bed3d 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -418,7 +418,7 @@ option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF) # Enabling this flag makes it easier to find cases where the compiler makes # assumptions on the size being 'fixed size', when building tests for # SVE/SVE2 or other scalable vector architectures. -option(LLVM_ENABLE_STRICT_IMPLICIT_CONVERSION_TYPESIZE +option(LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS "Enable assertions that type is not scalable in implicit conversion from TypeSize to uint64_t" OFF) set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 70ad34a41bde84..2563ee430174b1 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -95,8 +95,8 @@ if(LLVM_ENABLE_EXPENSIVE_CHECKS) endif() endif() -if (LLVM_ENABLE_STRICT_IMPLICIT_CONVERSION_TYPESIZE) - add_definitions(-DSTRICT_IMPLICIT_CONVERSION_TYPESIZE) +if (LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS) + add_definitions(-DSTRICT_FIXED_SIZE_VECTORS) endif() string(TOUPPER "${LLVM_ABI_BREAKING_CHECKS}" uppercase_LLVM_ABI_BREAKING_CHECKS) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3d1f9d38831991..313d1d404f7e0d 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3478,9 +3478,12 @@ the ``nsw`` flag.
Poison value behavior is defined in terms of value *dependence*: -- Values other than :ref:`phi <i_phi>` nodes depend on their operands. +- Values other than :ref:`phi <i_phi>` nodes and :ref:`select <i_select>` + instructions depend on their operands. - :ref:`Phi <i_phi>` nodes depend on the operand corresponding to their dynamic predecessor basic block. +- Select instructions depend on their condition operand and their + selected operand. - Function arguments depend on the corresponding actual argument values in the dynamic callers of their functions. - :ref:`Call <i_call>` instructions depend on the :ref:`ret <i_ret>` @@ -7724,6 +7727,8 @@ Example: <result> = fadd float 4.0, %var ; yields float:result = 4.0 + %var +.. _i_sub: + '``sub``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -7819,6 +7824,8 @@ Example: <result> = fsub float 4.0, %var ; yields float:result = 4.0 - %var <result> = fsub float -0.0, %val ; yields float:result = -%var +.. _i_mul: + '``mul``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -7913,6 +7920,8 @@ Example: <result> = fmul float 4.0, %var ; yields float:result = 4.0 * %var +.. _i_udiv: + '``udiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -7959,6 +7968,8 @@ Example: <result> = udiv i32 4, %var ; yields i32:result = 4 / %var +.. _i_sdiv: + '``sdiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8047,6 +8058,8 @@ Example: <result> = fdiv float 4.0, %var ; yields float:result = 4.0 / %var +.. _i_urem: + '``urem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8091,6 +8104,8 @@ Example: <result> = urem i32 4, %var ; yields i32:result = 4 % %var +.. _i_srem: + '``srem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8204,6 +8219,8 @@ commonly be strength reduced from other instructions. They require two operands of the same type, execute an operation on them, and produce a single value. The resulting value is the same type as its operands. +.. _i_shl: + '``shl``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -8256,6 +8273,9 @@ Example: <result> = shl i32 1, 32 ; undefined <result> = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 2, i32 4> +.. _i_lshr: + + '``lshr``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8305,6 +8325,8 @@ Example: <result> = lshr i32 1, 32 ; undefined <result> = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1> +.. _i_ashr: + '``ashr``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8355,6 +8377,8 @@ Example: <result> = ashr i32 1, 32 ; undefined <result> = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3> ; yields: result=<2 x i32> < i32 -1, i32 0> +.. _i_and: + '``and``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -8404,6 +8428,8 @@ Example: <result> = and i32 15, 40 ; yields i32:result = 8 <result> = and i32 4, 8 ; yields i32:result = 0 +.. _i_or: + '``or``' Instruction ^^^^^^^^^^^^^^^^^^^^ @@ -8453,6 +8479,8 @@ Example: <result> = or i32 15, 40 ; yields i32:result = 47 <result> = or i32 4, 8 ; yields i32:result = 12 +.. _i_xor: + '``xor``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -15256,6 +15284,678 @@ intrinsic returns the executable address corresponding to ``tramp`` after performing the required machine specific adjustments. The pointer returned can then be :ref:`bitcast and executed <int_trampoline>`. + +.. _int_vp: + +Vector Predication Intrinsics +----------------------------- +VP intrinsics are intended for predicated SIMD/vector code. A typical VP +operation takes a vector mask and an explicit vector length parameter as in: + +:: + + llvm.vp.<opcode>.*(<W x T> %x, <W x T> %y, <W x i1> %mask, i32 %evl) + +The vector mask parameter (%mask) always has a vector of `i1` type, for example +`<32 x i1>`. The explicit vector length parameter always has the type `i32` and +is an unsigned integer value.
The explicit vector length parameter (%evl) is in +the range: + +:: + + 0 <= %evl <= W, where W is the number of vector elements +Note that for :ref:`scalable vector types <t_vector>` ``W`` is the runtime +length of the vector. + +The VP intrinsic has undefined behavior if ``%evl > W``. The explicit vector +length (%evl) creates a mask, %EVLmask, with all elements ``0 <= i < %evl`` set +to True, and all other lanes ``%evl <= i < W`` to False. A new mask %M is +calculated with an element-wise AND from %mask and %EVLmask: + +:: + + M = %mask AND %EVLmask + +A vector operation ``<opcode>`` on vectors ``A`` and ``B`` calculates: + +:: + + A <opcode> B = { A[i] <opcode> B[i] M[i] = True, and + { undef otherwise + +Optimization Hint +^^^^^^^^^^^^^^^^^ + +Some targets, such as AVX512, do not support the %evl parameter in hardware. +The use of an effective %evl is discouraged for those targets. The function +``TargetTransformInfo::hasActiveVectorLength()`` returns true when the target +has native support for %evl. + + +.. _int_vp_add: + +'``llvm.vp.add.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.add.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.add.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.add``' intrinsic performs integer addition (:ref:`add <i_add>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = add <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + +.. _int_vp_sub: + +'``llvm.vp.sub.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sub.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.sub.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.sub.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.sub``' intrinsic performs integer subtraction +(:ref:`sub <i_sub>`) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = sub <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +
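Operationally, the %mask/%evl combination defined above boils down to a single per-lane predicate. A scalar C++ reference model of that rule, with ``std::optional`` standing in for undef lanes (the element type and function-pointer plumbing are arbitrary illustrative choices, not part of the specification)::

   #include <cstdint>
   #include <optional>
   #include <vector>

   // Reference model of a two-operand VP integer op: lanes with i < evl and
   // mask[i] set get a[i] <op> b[i]; every other lane is undef (empty).
   std::vector<std::optional<int32_t>>
   vpBinaryOp(const std::vector<int32_t> &a, const std::vector<int32_t> &b,
              const std::vector<bool> &mask, uint32_t evl,
              int32_t (*op)(int32_t, int32_t)) {
     const size_t w = a.size(); // behavior is undefined when evl > w
     std::vector<std::optional<int32_t>> r(w);
     for (size_t i = 0; i < w; ++i)
       if (i < evl && mask[i]) // M = %mask AND %EVLmask
         r[i] = op(a[i], b[i]);
     return r;
   }

The same equivalence appears in each intrinsic section here as a plain vector op followed by a ``select`` on %mask.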
+.. _int_vp_mul: + +'``llvm.vp.mul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.mul.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.mul.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.mul.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated integer multiplication of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" +The '``llvm.vp.mul``' intrinsic performs integer multiplication +(:ref:`mul <i_mul>`) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = mul <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_sdiv: + +'``llvm.vp.sdiv.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sdiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.sdiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated, signed division of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.sdiv``' intrinsic performs signed division (:ref:`sdiv <i_sdiv>`) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = sdiv <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_udiv: + +'``llvm.vp.udiv.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.udiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.udiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated, unsigned division of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The third operand is the vector mask and has the same number of elements as the result vector type. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.udiv``' intrinsic performs unsigned division +(:ref:`udiv <i_udiv>`) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +..
code-block:: llvm + + %r = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = udiv <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + + +.. _int_vp_srem: + +'``llvm.vp.srem.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.srem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.srem.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.srem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated computations of the signed remainder of two integer vectors. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.srem``' intrinsic computes the remainder of the signed division +(:ref:`srem <i_srem>`) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = srem <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + + +.. _int_vp_urem: + +'``llvm.vp.urem.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.urem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.urem.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.urem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Predicated computation of the unsigned remainder of two integer vectors. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.urem``' intrinsic computes the remainder of the unsigned division +(:ref:`urem <i_urem>`) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = urem <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_ashr: + +'``llvm.vp.ashr.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.ashr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.ashr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated arithmetic right-shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type.
The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.ashr``' intrinsic computes the arithmetic right shift +(:ref:`ashr <i_ashr>`) of the first operand by the second operand on each +enabled lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = ashr <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_lshr: + + +'``llvm.vp.lshr.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.lshr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.lshr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated logical right-shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.lshr``' intrinsic computes the logical right shift +(:ref:`lshr <i_lshr>`) of the first operand by the second operand on each +enabled lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = lshr <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_shl: + +'``llvm.vp.shl.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.shl.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.shl.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.shl.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated left shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.shl``' intrinsic computes the left shift (:ref:`shl <i_shl>`) of +the first operand by the second operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = shl <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_or: + +'``llvm.vp.or.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.or.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.or.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.or.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated or.
+ + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.or``' intrinsic performs a bitwise or (:ref:`or <i_or>`) of the +first two operands on each enabled lane. The result on disabled lanes is +undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = or <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_and: + +'``llvm.vp.and.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.and.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.and.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.and.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated and. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.and``' intrinsic performs a bitwise and (:ref:`and <i_and>`) of +the first two operands on each enabled lane. The result on disabled lanes is +undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = and <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_xor: + +'``llvm.vp.xor.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.xor.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>) + declare <vscale x 4 x i32> @llvm.vp.xor.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>) + declare <256 x i64> @llvm.vp.xor.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>) + +Overview: +""""""""" + +Vector-predicated, bitwise xor. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.xor``' intrinsic performs a bitwise xor (:ref:`xor <i_xor>`) of +the first two operands on each enabled lane. +The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = xor <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + ..
_int_mload_mstore: Masked Vector Load and Store Intrinsics diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h index dd576e039151d9..446041119c8078 100644 --- a/llvm/include/llvm/Analysis/MemoryLocation.h +++ b/llvm/include/llvm/Analysis/MemoryLocation.h @@ -89,6 +89,11 @@ class LocationSize { : Value(Raw > MaxValue ? Unknown : Raw) {} static LocationSize precise(uint64_t Value) { return LocationSize(Value); } + static LocationSize precise(TypeSize Value) { + if (Value.isScalable()) + return unknown(); + return precise(Value.getFixedSize()); + } static LocationSize upperBound(uint64_t Value) { // You can't go lower than 0, so give a precise result. @@ -98,6 +103,11 @@ class LocationSize { return unknown(); return LocationSize(Value | ImpreciseBit, Direct); } + static LocationSize upperBound(TypeSize Value) { + if (Value.isScalable()) + return unknown(); + return upperBound(Value.getFixedSize()); + } constexpr static LocationSize unknown() { return LocationSize(Unknown, Direct); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ca6892b14ef3b2..ce04592bf53eae 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1167,6 +1167,15 @@ class TargetTransformInfo { /// to a stack reload. unsigned getGISelRematGlobalCost() const; + /// \name Vector Predication Information + /// @{ + /// Whether the target supports the %evl parameter of VP intrinsic efficiently in hardware. + /// (see LLVM Language Reference - "Vector Predication Intrinsics") + /// Use of %evl is discouraged when that is not the case. + bool hasActiveVectorLength() const; + + /// @} + /// @} private: @@ -1420,6 +1429,7 @@ class TargetTransformInfo::Concept { ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; + virtual bool hasActiveVectorLength() const = 0; virtual int getInstructionLatency(const Instruction *I) = 0; }; @@ -1913,6 +1923,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getGISelRematGlobalCost(); } + bool hasActiveVectorLength() const override { + return Impl.hasActiveVectorLength(); + } + int getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 2acb88a6a83d13..765d35a05a46ca 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -628,6 +628,10 @@ class TargetTransformInfoImplBase { return 1; } + bool hasActiveVectorLength() const { + return false; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
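Per the "Optimization Hint" in the LangRef changes above, a transform would consult this hook before emitting a meaningful %evl. A schematic of that decision in C++, written against a mock of the query (the real entry point is the ``TargetTransformInfo::hasActiveVectorLength()`` added in this patch; everything else here is illustrative)::

   #include <cstdint>

   // Mock of the only query this decision needs.
   struct MockTTI {
     bool NativeEVL = false; // false models targets such as AVX512
     bool hasActiveVectorLength() const { return NativeEVL; }
   };

   struct VPLengthChoice {
     uint32_t EVL;           // explicit vector length to emit
     bool FoldBoundIntoMask; // caller must AND the EVL mask into %mask
   };

   // If %evl has no hardware support, emit the full width W and push the
   // iteration bound into the mask, so no result depends on %evl at runtime.
   inline VPLengthChoice chooseEVL(const MockTTI &TTI, uint32_t RequestedEVL,
                                   uint32_t W) {
     if (TTI.hasActiveVectorLength())
       return {RequestedEVL, false};
     return {W, true};
   }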
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 1d144688a006e3..d47dddf88f5163 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -190,6 +190,19 @@ class CombinerHelper { bool applyCombineShiftToUnmerge(MachineInstr &MI, const unsigned &ShiftVal); bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount); + /// Return true if any explicit use operand on \p MI is defined by a + /// G_IMPLICIT_DEF. + bool matchAnyExplicitUseIsUndef(MachineInstr &MI); + + /// Replace an instruction with a G_FCONSTANT with value \p C. + bool replaceInstWithFConstant(MachineInstr &MI, double C); + + /// Replace an instruction with a G_CONSTANT with value \p C. + bool replaceInstWithConstant(MachineInstr &MI, int64_t C); + + /// Replace an instruction with a G_IMPLICIT_DEF. + bool replaceInstWithUndef(MachineInstr &MI); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h index f528d1a460128c..8a8d3ce200409a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h @@ -29,18 +29,23 @@ class RegisterBank { private: unsigned ID; const char *Name; - unsigned Size; + const unsigned *Sizes; BitVector ContainedRegClasses; - /// Sentinel value used to recognize register bank not properly + /// HwMode of the target. Not initialized by the constructor, initialized + /// within generated RegisterBankInfo class constructor. + unsigned HwMode; + + /// Sentinel values used to recognize register bank not properly /// initialized yet. static const unsigned InvalidID; + static const unsigned InvalidHwMode; /// Only the RegisterBankInfo can initialize RegisterBank properly. friend RegisterBankInfo; public: - RegisterBank(unsigned ID, const char *Name, unsigned Size, + RegisterBank(unsigned ID, const char *Name, const unsigned *Sizes, const uint32_t *CoveredClasses, unsigned NumRegClasses); /// Get the identifier of this register bank. @@ -51,7 +56,7 @@ class RegisterBank { const char *getName() const { return Name; } /// Get the maximal size in bits that fits in this register bank. - unsigned getSize() const { return Size; } + unsigned getSize() const { return Sizes[HwMode]; } /// Check whether this instance is ready to be used. bool isValid() const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index 8725d96efd8214..b86d2d10322f0d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -415,7 +415,8 @@ class RegisterBankInfo { /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks /// RegisterBank instances. - RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); + RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks, + unsigned HwMode); /// This constructor is meaningless. 
/// It just provides a default constructor that can be used at link time diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 993a3c8bbf95fc..3cf7b6a3468057 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -935,7 +935,7 @@ class SelectionDAG { SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops); SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, - ArrayRef<SDValue> Ops); + ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags()); // Specialize based on number of operands. SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT); diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h index bcf4177629200f..15a4bfe1e5553f 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/llvm/include/llvm/CodeGen/ValueTypes.h @@ -19,6 +19,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TypeSize.h" +#include "llvm/Support/WithColor.h" #include <cassert> #include <cstdint> #include <string> @@ -75,9 +76,7 @@ namespace llvm { MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable); if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE) return M; - - assert(!IsScalable && "We don't support extended scalable types yet"); - return getExtendedVectorVT(Context, VT, NumElements); + return getExtendedVectorVT(Context, VT, NumElements, IsScalable); } /// Returns the EVT that represents a vector EC.Min elements in length, @@ -86,19 +85,15 @@ namespace llvm { MVT M = MVT::getVectorVT(VT.V, EC); if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE) return M; - assert (!EC.Scalable && "We don't support extended scalable types yet"); - return getExtendedVectorVT(Context, VT, EC.Min); + return getExtendedVectorVT(Context, VT, EC); } /// Return a vector with the same number of elements as this vector, but /// with the element type converted to an integer type with the same /// bitwidth. EVT changeVectorElementTypeToInteger() const { - if (!isSimple()) { - assert (!isScalableVector() && - "We don't support extended scalable types yet"); + if (!isSimple()) return changeExtendedVectorElementTypeToInteger(); - } MVT EltTy = getSimpleVT().getVectorElementType(); unsigned BitWidth = EltTy.getSizeInBits(); MVT IntTy = MVT::getIntegerVT(BitWidth); @@ -155,12 +150,12 @@ namespace llvm { /// Return true if this is a vector type where the runtime /// length is machine dependent bool isScalableVector() const { - // FIXME: We don't support extended scalable types yet, because the - // matching IR type doesn't exist. Once it has been added, this can - // be changed to call isExtendedScalableVector. - if (!isSimple()) - return false; - return V.isScalableVector(); + return isSimple() ? V.isScalableVector() : isExtendedScalableVector(); + } + + bool isFixedLengthVector() const { + return isSimple() ? V.isFixedLengthVector() + : isExtendedFixedLengthVector(); } /// Return true if this is a 16-bit vector type. @@ -273,7 +268,16 @@ namespace llvm { /// Given a vector type, return the number of elements it contains. unsigned getVectorNumElements() const { +#ifdef STRICT_FIXED_SIZE_VECTORS + assert(isFixedLengthVector() && "Invalid vector type!"); +#else assert(isVector() && "Invalid vector type!"); + if (isScalableVector()) + WithColor::warning() + << "Possible incorrect use of EVT::getVectorNumElements() for " + "scalable vector. Scalable flag may be dropped, use "
Scalable flag may be dropped, use" + "EVT::getVectorElementCount() instead\n"; +#endif if (isSimple()) return V.getVectorNumElements(); return getExtendedVectorNumElements(); @@ -285,9 +289,7 @@ namespace llvm { if (isSimple()) return V.getVectorElementCount(); - assert(!isScalableVector() && - "We don't support extended scalable types yet"); - return {getExtendedVectorNumElements(), false}; + return {getExtendedVectorNumElements(), isExtendedScalableVector()}; } /// Return the size of the specified value type in bits. @@ -428,8 +430,10 @@ namespace llvm { EVT changeExtendedTypeToInteger() const; EVT changeExtendedVectorElementTypeToInteger() const; static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth); - static EVT getExtendedVectorVT(LLVMContext &C, EVT VT, - unsigned NumElements); + static EVT getExtendedVectorVT(LLVMContext &C, EVT VT, unsigned NumElements, + bool IsScalable); + static EVT getExtendedVectorVT(LLVMContext &Context, EVT VT, + ElementCount EC); bool isExtendedFloatingPoint() const LLVM_READONLY; bool isExtendedInteger() const LLVM_READONLY; bool isExtendedScalarInteger() const LLVM_READONLY; @@ -442,8 +446,11 @@ namespace llvm { bool isExtended512BitVector() const LLVM_READONLY; bool isExtended1024BitVector() const LLVM_READONLY; bool isExtended2048BitVector() const LLVM_READONLY; + bool isExtendedFixedLengthVector() const LLVM_READONLY; + bool isExtendedScalableVector() const LLVM_READONLY; EVT getExtendedVectorElementType() const; unsigned getExtendedVectorNumElements() const LLVM_READONLY; + ElementCount getExtendedVectorElementCount() const LLVM_READONLY; TypeSize getExtendedSizeInBits() const LLVM_READONLY; }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index f869ebdfbe4e76..15fe079eccafde 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -143,12 +143,15 @@ class MachOPlatform : public Platform { MachOJITDylibInitializers::SectionExtent ObjCSelRefs, MachOJITDylibInitializers::SectionExtent ObjCClassList); - std::mutex PlatformMutex; ExecutionSession &ES; ObjectLinkingLayer &ObjLinkingLayer; std::unique_ptr StandardSymbolsObject; DenseMap RegisteredInitSymbols; + + // InitSeqs gets its own mutex to avoid locking the whole session when + // aggregating data from the jitlink. + std::mutex InitSeqsMutex; DenseMap InitSeqs; }; diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 5601e9a7518554..29886bf458a929 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -392,7 +392,7 @@ class ConstantAggregateZero final : public ConstantData { /// use operands. class ConstantAggregate : public Constant { protected: - ConstantAggregate(CompositeType *T, ValueTy VT, ArrayRef V); + ConstantAggregate(Type *T, ValueTy VT, ArrayRef V); public: /// Transparently provide more efficient getOperand methods. diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 3b56da680c6e2c..ac3abe3c32dc66 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -195,26 +195,6 @@ class FunctionCallee { Value *Callee = nullptr; }; -/// Common super class of ArrayType, StructType and VectorType. -class CompositeType : public Type { -protected: - explicit CompositeType(LLVMContext &C, TypeID tid) : Type(C, tid) {} - -public: - /// Given an index value into the type, return the type of the element. 
- Type *getTypeAtIndex(const Value *V) const; - Type *getTypeAtIndex(unsigned Idx) const; - bool indexValid(const Value *V) const; - bool indexValid(unsigned Idx) const; - - /// Methods for support type inquiry through isa, cast, and dyn_cast. - static bool classof(const Type *T) { - return T->getTypeID() == ArrayTyID || - T->getTypeID() == StructTyID || - T->getTypeID() == VectorTyID; - } -}; - /// Class to represent struct types. There are two different kinds of struct /// types: Literal structs and Identified structs. /// @@ -235,8 +215,8 @@ class CompositeType : public Type { /// elements as defined by DataLayout (which is required to match what the code /// generator for a target expects). /// -class StructType : public CompositeType { - StructType(LLVMContext &C) : CompositeType(C, StructTyID) {} +class StructType : public Type { + StructType(LLVMContext &C) : Type(C, StructTyID) {} enum { /// This is the contents of the SubClassData field. @@ -350,6 +330,11 @@ class StructType : public CompositeType { assert(N < NumContainedTys && "Element number out of range!"); return ContainedTys[N]; } + /// Given an index value into the type, return the type of the element. + Type *getTypeAtIndex(const Value *V) const; + Type *getTypeAtIndex(unsigned N) const { return getElementType(N); } + bool indexValid(const Value *V) const; + bool indexValid(unsigned Idx) const { return Idx < getNumElements(); } /// Methods for support type inquiry through isa, cast, and dyn_cast. static bool classof(const Type *T) { @@ -375,14 +360,14 @@ Type *Type::getStructElementType(unsigned N) const { /// for use of SIMD instructions. SequentialType holds the common features of /// both, which stem from the fact that both lay their components out in memory /// identically. -class SequentialType : public CompositeType { +class SequentialType : public Type { Type *ContainedType; ///< Storage for the single contained type. uint64_t NumElements; protected: SequentialType(TypeID TID, Type *ElType, uint64_t NumElements) - : CompositeType(ElType->getContext(), TID), ContainedType(ElType), - NumElements(NumElements) { + : Type(ElType->getContext(), TID), ContainedType(ElType), + NumElements(NumElements) { ContainedTys = &ContainedType; NumContainedTys = 1; } diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index e558d4317efc3b..713624d13bef0c 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1008,16 +1008,23 @@ class GetElementPtrInst : public Instruction { return getPointerAddressSpace(); } - /// Returns the type of the element that would be loaded with - /// a load instruction with the specified parameters. + /// Returns the result type of a getelementptr with the given source + /// element type and indexes. /// /// Null is returned if the indices are invalid for the specified - /// pointer type. - /// + /// source element type. static Type *getIndexedType(Type *Ty, ArrayRef IdxList); static Type *getIndexedType(Type *Ty, ArrayRef IdxList); static Type *getIndexedType(Type *Ty, ArrayRef IdxList); + /// Return the type of the element at the given index of an indexable + /// type. This is equivalent to "getIndexedType(Agg, {Zero, Idx})". + /// + /// Returns null if the type can't be indexed, or the given index is not + /// legal for the given type. 
+ static Type *getTypeAtIndex(Type *Ty, Value *Idx); + static Type *getTypeAtIndex(Type *Ty, uint64_t Idx); + inline op_iterator idx_begin() { return op_begin()+1; } inline const_op_iterator idx_begin() const { return op_begin()+1; } inline op_iterator idx_end() { return op_end(); } diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index cebe07e42afc75..0ca1688a7c91fd 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -206,6 +206,48 @@ namespace llvm { /// @} }; + /// This is the common base class for vector predication intrinsics. + class VPIntrinsic : public IntrinsicInst { + public: + static Optional GetMaskParamPos(Intrinsic::ID IntrinsicID); + static Optional GetVectorLengthParamPos(Intrinsic::ID IntrinsicID); + + /// The llvm.vp.* intrinsics for this instruction Opcode + static Intrinsic::ID GetForOpcode(unsigned OC); + + // Whether \p ID is a VP intrinsic ID. + static bool IsVPIntrinsic(Intrinsic::ID); + + /// \return the mask parameter or nullptr. + Value *getMaskParam() const; + + /// \return the vector length parameter or nullptr. + Value *getVectorLengthParam() const; + + /// \return whether the vector length param can be ignored. + bool canIgnoreVectorLengthParam() const; + + /// \return the static element count (vector number of elements) the vector + /// length parameter applies to. + ElementCount getStaticVectorLength() const; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return IsVPIntrinsic(I->getIntrinsicID()); + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + // Equivalent non-predicated opcode + unsigned getFunctionalOpcode() const { + return GetFunctionalOpcodeForVP(getIntrinsicID()); + } + + // Equivalent non-predicated opcode + static unsigned GetFunctionalOpcodeForVP(Intrinsic::ID ID); + }; + /// This is the common base class for constrained floating point intrinsics. class ConstrainedFPIntrinsic : public IntrinsicInst { public: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 797d7b1765c3dd..0812d707e4feda 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -27,6 +27,10 @@ class IntrinsicProperty; // effects. It may be CSE'd deleted if dead, etc. def IntrNoMem : IntrinsicProperty; +// IntrNoSync - Threads executing the intrinsic will not synchronize using +// memory or other means. +def IntrNoSync : IntrinsicProperty; + // IntrReadMem - This intrinsic only reads from memory. It does not write to // memory and has no other side effects. Therefore, it cannot be moved across // potentially aliasing stores. 
However, it can be reordered otherwise and can @@ -1153,6 +1157,79 @@ def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWil def int_ptrmask: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_anyint_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +//===---------------- Vector Predication Intrinsics --------------===// + +// Binary operators +let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { + def int_vp_add : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sub : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_mul : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sdiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_udiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_srem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_urem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_lshr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_shl : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_or : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_and : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_xor : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + +} + + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index a6de3f949199e2..50f26a7fd6d3ca 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1302,29 +1302,29 @@ def int_aarch64_sve_prf // Scalar + 32-bit scaled offset vector, zero extend, packed and // unpacked. 
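Taken together, the llvm.vp.* definitions just added and the VPIntrinsic class from the IntrinsicInst.h hunk above give passes one uniform way to inspect a predicated vector operation. A minimal consumer sketch, using only accessors declared in that hunk (the surrounding function is illustrative):

  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  // Sketch: recover the mask, the explicit vector length, and the equivalent
  // unpredicated opcode from a vector-predication call.
  static void inspectVP(Instruction &I) {
    auto *VPI = dyn_cast<VPIntrinsic>(&I);
    if (!VPI)
      return;
    Value *Mask = VPI->getMaskParam();        // i1 vector operand, or nullptr
    Value *EVL = VPI->getVectorLengthParam(); // i32 %evl operand, or nullptr
    unsigned OC = VPI->getFunctionalOpcode(); // e.g. Instruction::Add for vp.add
    if (VPI->canIgnoreVectorLengthParam()) {
      // %evl covers the whole vector, so only the mask actually predicates.
    }
    (void)Mask; (void)EVL; (void)OC;
  }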
-def int_aarch64_sve_gather_prfb_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; // Scalar + 32-bit scaled offset vector, sign extend, packed and // unpacked. -def int_aarch64_sve_gather_prfb_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; // Scalar + 64-bit scaled offset vector. -def int_aarch64_sve_gather_prfb_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; // Vector + scalar. 
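The prefetch renames above and continuing below are mechanical: "gather" moves behind the element-size suffix, so gather_prfb becomes prfb_gather, matching the gather load naming. C++ references to the intrinsic enum follow the same pattern; a sketch (the overload types are a placeholder for whatever the SVE_gather_prf_* classes actually require):

  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Sketch: was Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw before this
  // rename; the declaration lookup itself is unchanged.
  static Function *getPrfbGatherDecl(Module &M, ArrayRef<Type *> OverloadTys) {
    return Intrinsic::getDeclaration(
        &M, Intrinsic::aarch64_sve_prfb_gather_scaled_uxtw, OverloadTys);
  }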
-def int_aarch64_sve_gather_prfb : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfh : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfw : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfd : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfb_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfh_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfw_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfd_gather : SVE_gather_prf_vector_base_scalar_offset; // // Scalar to vector operations diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 00251e52ca248a..80ed0792a209c2 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1268,6 +1268,11 @@ defm int_arm_mve_vqrdmlash: MVEPredicated<[llvm_anyvector_ty], [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */, llvm_i32_ty /* addend (scalar) */]>; +defm int_arm_mve_vqdmlad: MVEPredicated<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i32_ty /* exchange */, llvm_i32_ty /* round */, + llvm_i32_ty /* subtract */]>; + // CDE (Custom Datapath Extension) def int_arm_cde_cx1: Intrinsic< diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def new file mode 100644 index 00000000000000..d3e1fc854373da --- /dev/null +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -0,0 +1,84 @@ +//===-- IR/VPIntrinsics.def - Describes llvm.vp.* Intrinsics -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains descriptions of the various Vector Predication intrinsics. +// This is used as a central place for enumerating the different instructions +// and should eventually be the place to put comments about the instructions. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +// Provide definitions of macros so that users of this file do not have to +// define everything to use it... 
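VPIntrinsics.def (whose macro guards follow) uses the standard LLVM .def idiom: a consumer defines whichever macros it cares about before including the file, gets one expansion per entry, and the file #undefs everything at the end. A sketch of how the GetFunctionalOpcodeForVP mapping declared earlier could be generated from it (the fallback value is an assumption):

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  static unsigned getFunctionalOpcodeForVP(Intrinsic::ID ID) {
    switch (ID) {
    default:
      break;
  // Each HANDLE_VP_TO_OC(vp_add, Add) entry becomes a case mapping
  // Intrinsic::vp_add to Instruction::Add, and so on.
  #define HANDLE_VP_TO_OC(VPID, OC)                                            \
    case Intrinsic::VPID:                                                      \
      return Instruction::OC;
  #include "llvm/IR/VPIntrinsics.def"
    }
    return Instruction::Call; // assumed fallback for non-VP IDs
  }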
+// +#ifndef REGISTER_VP_INTRINSIC +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) +#endif + +// Map this VP intrinsic to its functional Opcode +#ifndef HANDLE_VP_TO_OC +#define HANDLE_VP_TO_OC(VPID, OC) +#endif + +///// Integer Arithmetic ///// + +// llvm.vp.add(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_add, 2, 3) +HANDLE_VP_TO_OC(vp_add, Add) + +// llvm.vp.and(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_and, 2, 3) +HANDLE_VP_TO_OC(vp_and, And) + +// llvm.vp.ashr(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_ashr, 2, 3) +HANDLE_VP_TO_OC(vp_ashr, AShr) + +// llvm.vp.lshr(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_lshr, 2, 3) +HANDLE_VP_TO_OC(vp_lshr, LShr) + +// llvm.vp.mul(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_mul, 2, 3) +HANDLE_VP_TO_OC(vp_mul, Mul) + +// llvm.vp.or(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_or, 2, 3) +HANDLE_VP_TO_OC(vp_or, Or) + +// llvm.vp.sdiv(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_sdiv, 2, 3) +HANDLE_VP_TO_OC(vp_sdiv, SDiv) + +// llvm.vp.shl(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_shl, 2, 3) +HANDLE_VP_TO_OC(vp_shl, Shl) + +// llvm.vp.srem(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_srem, 2, 3) +HANDLE_VP_TO_OC(vp_srem, SRem) + +// llvm.vp.sub(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_sub, 2, 3) +HANDLE_VP_TO_OC(vp_sub, Sub) + +// llvm.vp.udiv(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_udiv, 2, 3) +HANDLE_VP_TO_OC(vp_udiv, UDiv) + +// llvm.vp.urem(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_urem, 2, 3) +HANDLE_VP_TO_OC(vp_urem, URem) + +// llvm.vp.xor(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_xor, 2, 3) +HANDLE_VP_TO_OC(vp_xor, Xor) + +#undef REGISTER_VP_INTRINSIC +#undef HANDLE_VP_TO_OC diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h index 7c0278e8770eed..571550012374c9 100644 --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -7,11 +7,11 @@ //===----------------------------------------------------------------------===// /// \file /// -/// Generic dominator tree construction - This file provides routines to +/// Generic dominator tree construction - this file provides routines to /// construct immediate dominator information for a flow-graph based on the /// Semi-NCA algorithm described in this dissertation: /// -/// Linear-Time Algorithms for Dominators and Related Problems +/// [1] Linear-Time Algorithms for Dominators and Related Problems /// Loukas Georgiadis, Princeton University, November 2005, pp. 21-23: /// ftp://ftp.cs.princeton.edu/reports/2005/737.pdf /// @@ -20,13 +20,15 @@ /// /// O(n^2) worst cases happen when the computation of nearest common ancestors /// requires O(n) average time, which is very unlikely in real world. If this -/// ever turns out to be an issue, consider implementing a hybrid algorithm. +/// ever turns out to be an issue, consider implementing a hybrid algorithm +/// that uses SLT to perform full constructions and SemiNCA for incremental +/// updates. /// /// The file uses the Depth Based Search algorithm to perform incremental /// updates (insertion and deletions). The implemented algorithm is based on /// this publication: /// -/// An Experimental Study of Dynamic Dominators +/// [2] An Experimental Study of Dynamic Dominators /// Loukas Georgiadis, et al., April 12 2016, pp. 
5-7, 9-10: /// https://arxiv.org/pdf/1604.02711.pdf /// @@ -732,7 +734,7 @@ struct SemiNCAInfo { LLVM_DEBUG(dbgs() << "Roots are different in updated trees\n" << "The entire tree needs to be rebuilt\n"); // It may be possible to update the tree without recalculating it, but - // we do not know yet how to do it, and it happens rarely in practise. + // we do not know yet how to do it, and it happens rarely in practice. CalculateFromScratch(DT, BUI); } } @@ -757,13 +759,13 @@ struct SemiNCAInfo { LLVM_DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n"); const unsigned NCDLevel = NCD->getLevel(); - // Based on Lemma 2.5 from the second paper, after insertion of (From,To), v - // is affected iff depth(NCD)+1 < depth(v) && a path P from To to v exists - // where every w on P s.t. depth(v) <= depth(w) + // Based on Lemma 2.5 from [2], after insertion of (From,To), v is affected + // iff depth(NCD)+1 < depth(v) && a path P from To to v exists where every + // w on P s.t. depth(v) <= depth(w) // // This reduces to a widest path problem (maximizing the depth of the // minimum vertex in the path) which can be solved by a modified version of - // Dijkstra with a bucket queue (named depth-based search in the paper). + // Dijkstra with a bucket queue (named depth-based search in [2]). // To is in the path, so depth(NCD)+1 < depth(v) <= depth(To). Nothing // affected if this does not hold. @@ -957,7 +959,7 @@ struct SemiNCAInfo { << BlockNamePrinter(ToIDom) << "\n"); // To remains reachable after deletion. - // (Based on the caption under Figure 4. from the second paper.) + // (Based on the caption under Figure 4. from [2].) if (FromTN != ToIDom || HasProperSupport(DT, BUI, ToTN)) DeleteReachable(DT, BUI, FromTN, ToTN); else @@ -976,7 +978,7 @@ struct SemiNCAInfo { LLVM_DEBUG(dbgs() << "\tRebuilding subtree\n"); // Find the top of the subtree that needs to be rebuilt. - // (Based on the lemma 2.6 from the second paper.) + // (Based on the lemma 2.6 from [2].) const NodePtr ToIDom = DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock()); assert(ToIDom || DT.isPostDominator()); @@ -1008,7 +1010,7 @@ struct SemiNCAInfo { } // Checks if a node has proper support, as defined on the page 3 and later - // explained on the page 7 of the second paper. + // explained on the page 7 of [2]. static bool HasProperSupport(DomTreeT &DT, const BatchUpdatePtr BUI, const TreeNodePtr TN) { LLVM_DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) @@ -1033,7 +1035,7 @@ struct SemiNCAInfo { } // Handle deletions that make destination node unreachable. - // (Based on the lemma 2.7 from the second paper.) + // (Based on the lemma 2.7 from [2].) static void DeleteUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI, const TreeNodePtr ToTN) { LLVM_DEBUG(dbgs() << "Deleting unreachable subtree " @@ -1493,9 +1495,9 @@ struct SemiNCAInfo { // LEFT, and thus, LEFT is really an ancestor (in the dominator tree) of // RIGHT, not a sibling. - // It is possible to verify the parent and sibling properties in - // linear time, but the algorithms are complex. Instead, we do it in a - // straightforward N^2 and N^3 way below, using direct path reachability. + // It is possible to verify the parent and sibling properties in linear time, + // but the algorithms are complex. Instead, we do it in a straightforward + // N^2 and N^3 way below, using direct path reachability.
// Checks if the tree has the parent property: if for all edges from V to W in // the input graph, such that V is reachable, the parent of W in the tree is @@ -1571,7 +1573,7 @@ struct SemiNCAInfo { // Check if the given tree is the same as a freshly computed one for the same // Parent. - // Running time: O(N^2), but faster in practise (same as tree construction). + // Running time: O(N^2), but faster in practice (same as tree construction). // // Note that this does not check that the tree construction algorithm is // correct and should only be used for fast (but possibly unsound) @@ -1648,12 +1650,12 @@ bool Verify(const DomTreeT &DT, typename DomTreeT::VerificationLevel VL) { if (!SNCA.IsSameAsFreshTree(DT)) return false; - // Common checks to verify the properties of the tree. O(N log N) at worst + // Common checks to verify the properties of the tree. O(N log N) at worst. if (!SNCA.verifyRoots(DT) || !SNCA.verifyReachability(DT) || !SNCA.VerifyLevels(DT) || !SNCA.VerifyDFSNumbers(DT)) return false; - // Extra checks depending on VerificationLevel. Up to O(N^3) + // Extra checks depending on VerificationLevel. Up to O(N^3). if (VL == DomTreeT::VerificationLevel::Basic || VL == DomTreeT::VerificationLevel::Full) if (!SNCA.verifyParentProperty(DT)) diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h index 97955f882d51ee..f0b2810cd7a9b9 100644 --- a/llvm/include/llvm/Support/Path.h +++ b/llvm/include/llvm/Support/Path.h @@ -468,10 +468,6 @@ StringRef remove_leading_dotslash(StringRef path, Style style = Style::native); bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false, Style style = Style::native); -#if defined(_WIN32) -std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16); -#endif - } // end namespace path } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index d800317204223b..5d4d2f812b3f09 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -165,7 +165,7 @@ class TypeSize { // bail out early for scalable vectors and use getFixedSize() // } operator uint64_t() const { -#ifdef STRICT_IMPLICIT_CONVERSION_TYPESIZE +#ifdef STRICT_FIXED_SIZE_VECTORS return getFixedSize(); #else if (isScalable()) diff --git a/llvm/include/llvm/Support/Windows/WindowsSupport.h b/llvm/include/llvm/Support/Windows/WindowsSupport.h index bb7e79b8601805..bd5a90c2c3f00f 100644 --- a/llvm/include/llvm/Support/Windows/WindowsSupport.h +++ b/llvm/include/llvm/Support/Windows/WindowsSupport.h @@ -236,6 +236,12 @@ namespace windows { // UTF-8 regardless of the current code page setting. std::error_code GetCommandLineArguments(SmallVectorImpl<const char *> &Args, BumpPtrAllocator &Alloc); + +/// Convert UTF-8 path to a suitable UTF-16 path for use with the Win32 Unicode +/// File API. +std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16, + size_t MaxPathLen = MAX_PATH); + } // end namespace windows } // end namespace sys } // end namespace llvm.
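With widenPath gone from llvm::sys::path (above) and re-declared in llvm::sys::windows with an explicit MaxPathLen knob, call sites take the shape below. A hedged sketch; the Win32 call is illustrative, but the widen-then-call pattern mirrors how the Path.inc implementations consume the UTF-16 buffer:

  #ifdef _WIN32
  #include "llvm/Support/Windows/WindowsSupport.h"
  using namespace llvm;

  // Sketch: widen a UTF-8 path before handing it to a Win32 *W API. Assumes
  // widenPath null-terminates its output, as existing callers rely on.
  static bool fileExistsW(const Twine &Utf8Path) {
    SmallVector<wchar_t, MAX_PATH> Path16;
    if (sys::windows::widenPath(Utf8Path, Path16))
      return false; // conversion failed; error code dropped for brevity
    return ::GetFileAttributesW(Path16.begin()) != INVALID_FILE_ATTRIBUTES;
  }
  #endif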
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 6a34e9f9fc15fe..baa31d0a09e5f5 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -142,7 +142,34 @@ def mul_to_shl : GICombineRule< [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]), (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>; +// [us]itofp(undef) = 0, because the result value is bounded. +def undef_to_fp_zero : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_UITOFP, G_SITOFP):$root, + [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), + (apply [{ Helper.replaceInstWithFConstant(*${root}, 0.0); }])>; + +def undef_to_int_zero: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_AND, G_MUL):$root, + [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), + (apply [{ Helper.replaceInstWithConstant(*${root}, 0); }])>; + +def undef_to_negative_one: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_OR):$root, + [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), + (apply [{ Helper.replaceInstWithConstant(*${root}, -1); }])>; + +def propagate_undef: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR):$root, + [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), + (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + +def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, + undef_to_negative_one, propagate_undef]>; + def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, - combines_for_extload, combine_indexed_load_store]>; - + combines_for_extload, combine_indexed_load_store, undef_combines]>; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index fdf1313e5491bc..c2e14d14d35111 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -237,6 +237,9 @@ class TargetMachine { void setSupportsDefaultOutlining(bool Enable) { Options.SupportsDefaultOutlining = Enable; } + void setSupportsDebugEntryValues(bool Enable) { + Options.SupportsDebugEntryValues = Enable; + } bool shouldPrintMachineCode() const { return Options.PrintMachineCode; } diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 9378e290bed141..7282040a352fa2 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -134,8 +134,8 @@ namespace llvm { EmulatedTLS(false), ExplicitEmulatedTLS(false), EnableIPRA(false), EmitStackSizeSection(false), EnableMachineOutliner(false), SupportsDefaultOutlining(false), EmitAddrsig(false), - EmitCallSiteInfo(false), EnableDebugEntryValues(false), - ForceDwarfFrameSection(false) {} + EmitCallSiteInfo(false), SupportsDebugEntryValues(false), + EnableDebugEntryValues(false), ForceDwarfFrameSection(false) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs /// option is specified on the command line, and should enable debugging @@ -286,8 +286,16 @@ namespace llvm { /// info, and it is restricted only to optimized code. This can be used for /// something else, so that should be controlled in the frontend. unsigned EmitCallSiteInfo : 1; - /// Emit debug info about parameter's entry values. 
- unsigned EnableDebugEntryValues : 1; + /// Set if the target supports debug entry values by default. + unsigned SupportsDebugEntryValues : 1; + /// When set to true, the EnableDebugEntryValues option forces production + /// of debug entry values even if the target does not officially support + /// it. Useful for testing purposes only. This flag should never be checked + /// directly; always use \ref ShouldEmitDebugEntryValues instead. + unsigned EnableDebugEntryValues : 1; + /// NOTE: Some targets do not yet support the production of debug entry + /// values. + bool ShouldEmitDebugEntryValues() const; /// Emit DWARF debug frame section. unsigned ForceDwarfFrameSection : 1; diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 427e6fd3ace24c..be0feeb9237e92 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -633,6 +633,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { if (!I.getAllocatedType()->isSized()) return unknown(); + if (I.getAllocatedType()->isVectorTy() && + I.getAllocatedType()->getVectorIsScalable()) + return unknown(); + APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType())); if (!I.isArrayAllocation()) return std::make_pair(align(Size, I.getAlignment()), Zero); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 48591c54a762d0..7a7be5bd3cb3f1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -356,7 +356,7 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, } static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, - bool NSW, + bool NSW, const APInt &DemandedElts, KnownBits &KnownOut, KnownBits &Known2, unsigned Depth, const Query &Q) { unsigned BitWidth = KnownOut.getBitWidth(); @@ -364,18 +364,19 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, // If an initial sequence of bits in the result is not needed, the // corresponding bits in the operands are not needed.
KnownBits LHSKnown(BitWidth); - computeKnownBits(Op0, LHSKnown, Depth + 1, Q); - computeKnownBits(Op1, Known2, Depth + 1, Q); + computeKnownBits(Op0, DemandedElts, LHSKnown, Depth + 1, Q); + computeKnownBits(Op1, DemandedElts, Known2, Depth + 1, Q); KnownOut = KnownBits::computeForAddSub(Add, NSW, LHSKnown, Known2); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, - KnownBits &Known, KnownBits &Known2, - unsigned Depth, const Query &Q) { + const APInt &DemandedElts, KnownBits &Known, + KnownBits &Known2, unsigned Depth, + const Query &Q) { unsigned BitWidth = Known.getBitWidth(); - computeKnownBits(Op1, Known, Depth + 1, Q); - computeKnownBits(Op0, Known2, Depth + 1, Q); + computeKnownBits(Op1, DemandedElts, Known, Depth + 1, Q); + computeKnownBits(Op0, DemandedElts, Known2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; @@ -1121,15 +1122,15 @@ static void computeKnownBitsFromOperator(const Operator *I, if (!Known.Zero[0] && !Known.One[0] && match(I, m_c_BinOp(m_Value(X), m_Add(m_Deferred(X), m_Value(Y))))) { Known2.resetAll(); - computeKnownBits(Y, Known2, Depth + 1, Q); + computeKnownBits(Y, DemandedElts, Known2, Depth + 1, Q); if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } break; } case Instruction::Or: - computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); // Output known-0 bits are only known if clear in both the LHS & RHS. Known.Zero &= Known2.Zero; @@ -1137,8 +1138,8 @@ static void computeKnownBitsFromOperator(const Operator *I, Known.One |= Known2.One; break; case Instruction::Xor: { - computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); // Output known-0 bits are known if clear or set in both the LHS & RHS. 
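The mechanical change running through these ValueTracking.cpp hunks is that every recursive helper now threads an APInt DemandedElts lane mask. The payoff shows up in the ExtractElement case below: a constant, in-range index demands a single lane. A small sketch of the two mask shapes involved (values are illustrative):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  static void demandedEltsExample() {
    unsigned NumElts = 4; // e.g. analyzing a <4 x i32> operand
    // Variable or out-of-range index: every lane is demanded.
    APInt AllLanes = APInt::getAllOnesValue(NumElts);
    // Constant in-range index (say 2): only that lane is demanded, so known
    // bits of the other lanes no longer dilute the result.
    APInt OneLane = APInt::getOneBitSet(NumElts, 2);
    (void)AllLanes;
    (void)OneLane;
  }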
APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); @@ -1149,8 +1150,8 @@ static void computeKnownBitsFromOperator(const Operator *I, } case Instruction::Mul: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); - computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known, - Known2, Depth, Q); + computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, DemandedElts, + Known, Known2, Depth, Q); break; } case Instruction::UDiv: { @@ -1336,13 +1337,13 @@ static void computeKnownBitsFromOperator(const Operator *I, case Instruction::Sub: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } case Instruction::Add: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } case Instruction::SRem: @@ -1605,12 +1606,12 @@ static void computeKnownBitsFromOperator(const Operator *I, switch (II->getIntrinsicID()) { default: break; case Intrinsic::bitreverse: - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.reverseBits(); Known.One |= Known2.One.reverseBits(); break; case Intrinsic::bswap: - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.byteSwap(); Known.One |= Known2.One.byteSwap(); break; @@ -1762,13 +1763,20 @@ static void computeKnownBitsFromOperator(const Operator *I, } break; } - case Instruction::ExtractElement: - // Look through extract element. At the moment we keep this simple and skip - // tracking the specific element. But at least we might find information - // valid for all elements of the vector (for example if vector is sign - // extended, shifted, etc). - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + case Instruction::ExtractElement: { + // Look through extract element. If the index is non-constant or + // out-of-range demand all elements, otherwise just the extracted element. 
+ auto* EEI = cast(I); + const Value* Vec = EEI->getVectorOperand(); + const Value* Idx = EEI->getIndexOperand(); + auto *CIdx = dyn_cast(Idx); + unsigned NumElts = Vec->getType()->getVectorNumElements(); + APInt DemandedVecElts = APInt::getAllOnesValue(NumElts); + if (CIdx && CIdx->getValue().ult(NumElts)) + DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue()); + computeKnownBits(Vec, DemandedVecElts, Known, Depth + 1, Q); break; + } case Instruction::ExtractValue: if (IntrinsicInst *II = dyn_cast(I->getOperand(0))) { const ExtractValueInst *EVI = cast(I); @@ -1779,19 +1787,19 @@ static void computeKnownBitsFromOperator(const Operator *I, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), - II->getArgOperand(1), false, Known, Known2, - Depth, Q); + II->getArgOperand(1), false, DemandedElts, + Known, Known2, Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), - II->getArgOperand(1), false, Known, Known2, - Depth, Q); + II->getArgOperand(1), false, DemandedElts, + Known, Known2, Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } } diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 1632895fe5fa3f..2a995922654081 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -395,7 +395,7 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal, /// For an aggregate type, determine whether a given index is within bounds or /// not. -static bool indexReallyValid(CompositeType *T, unsigned Idx) { +static bool indexReallyValid(Type *T, unsigned Idx) { if (ArrayType *AT = dyn_cast(T)) return Idx < AT->getNumElements(); @@ -419,7 +419,7 @@ static bool indexReallyValid(CompositeType *T, unsigned Idx) { /// function again on a finished iterator will repeatedly return /// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty /// aggregate or a non-aggregate -static bool advanceToNextLeafType(SmallVectorImpl &SubTypes, +static bool advanceToNextLeafType(SmallVectorImpl &SubTypes, SmallVectorImpl &Path) { // First march back up the tree until we can successfully increment one of the // coordinates in Path. @@ -435,16 +435,16 @@ static bool advanceToNextLeafType(SmallVectorImpl &SubTypes, // We know there's *some* valid leaf now, so march back down the tree picking // out the left-most element at each node. ++Path.back(); - Type *DeeperType = SubTypes.back()->getTypeAtIndex(Path.back()); + Type *DeeperType = + ExtractValueInst::getIndexedType(SubTypes.back(), Path.back()); while (DeeperType->isAggregateType()) { - CompositeType *CT = cast(DeeperType); - if (!indexReallyValid(CT, 0)) + if (!indexReallyValid(DeeperType, 0)) return true; - SubTypes.push_back(CT); + SubTypes.push_back(DeeperType); Path.push_back(0); - DeeperType = CT->getTypeAtIndex(0U); + DeeperType = ExtractValueInst::getIndexedType(DeeperType, 0); } return true; @@ -460,17 +460,15 @@ static bool advanceToNextLeafType(SmallVectorImpl &SubTypes, /// For example, if Next was {[0 x i64], {{}, i32, {}}, i32} then we would setup /// Path as [1, 1] and SubTypes as [Next, {{}, i32, {}}] to represent the first /// i32 in that type. 
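The doc comment above describes the iterator state that the following hunks migrate off CompositeType: ExtractValueInst::getIndexedType becomes the single way to step into an aggregate, returning null for a non-indexable type or invalid index (per the Instructions.h hunk earlier). A standalone sketch of the same descent idiom:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Sketch: descend to the left-most leaf of an aggregate type, recording the
  // path, in the style of the rewritten firstRealType below.
  static Type *leftmostLeaf(Type *Ty, SmallVectorImpl<unsigned> &Path) {
    while (Type *Inner = ExtractValueInst::getIndexedType(Ty, 0)) {
      Path.push_back(0);
      Ty = Inner;
    }
    return Ty; // a scalar, or an empty aggregate such as {}
  }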
-static bool firstRealType(Type *Next, - SmallVectorImpl &SubTypes, +static bool firstRealType(Type *Next, SmallVectorImpl &SubTypes, SmallVectorImpl &Path) { // First initialise the iterator components to the first "leaf" node // (i.e. node with no valid sub-type at any index, so {} does count as a leaf // despite nominally being an aggregate). - while (Next->isAggregateType() && - indexReallyValid(cast(Next), 0)) { - SubTypes.push_back(cast(Next)); + while (Type *FirstInner = ExtractValueInst::getIndexedType(Next, 0)) { + SubTypes.push_back(Next); Path.push_back(0); - Next = cast(Next)->getTypeAtIndex(0U); + Next = FirstInner; } // If there's no Path now, Next was originally scalar already (or empty @@ -480,7 +478,8 @@ static bool firstRealType(Type *Next, // Otherwise, use normal iteration to keep looking through the tree until we // find a non-aggregate type. - while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType()) { + while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back()) + ->isAggregateType()) { if (!advanceToNextLeafType(SubTypes, Path)) return false; } @@ -490,14 +489,15 @@ static bool firstRealType(Type *Next, /// Set the iterator data-structures to the next non-empty, non-aggregate /// subtype. -static bool nextRealType(SmallVectorImpl &SubTypes, +static bool nextRealType(SmallVectorImpl &SubTypes, SmallVectorImpl &Path) { do { if (!advanceToNextLeafType(SubTypes, Path)) return false; assert(!Path.empty() && "found a leaf but didn't set the path?"); - } while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType()); + } while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back()) + ->isAggregateType()); return true; } @@ -669,7 +669,7 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, } SmallVector RetPath, CallPath; - SmallVector RetSubTypes, CallSubTypes; + SmallVector RetSubTypes, CallSubTypes; bool RetEmpty = !firstRealType(RetVal->getType(), RetSubTypes, RetPath); bool CallEmpty = !firstRealType(CallVal->getType(), CallSubTypes, CallPath); @@ -692,7 +692,8 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, // We've exhausted the values produced by the tail call instruction, the // rest are essentially undef. The type doesn't really matter, but we need // *something*. - Type *SlotType = RetSubTypes.back()->getTypeAtIndex(RetPath.back()); + Type *SlotType = + ExtractValueInst::getIndexedType(RetSubTypes.back(), RetPath.back()); CallVal = UndefValue::get(SlotType); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7efeb1a3736ad7..7b469d496b7032 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -95,6 +95,10 @@ static cl::opt UseDwarfRangesBaseAddressSpecifier( "use-dwarf-ranges-base-address-specifier", cl::Hidden, cl::desc("Use base address specifiers in debug_ranges"), cl::init(false)); +static cl::opt EmitDwarfDebugEntryValues( + "emit-debug-entry-values", cl::Hidden, + cl::desc("Emit the debug entry values"), cl::init(false)); + static cl::opt GenerateARangeSection("generate-arange-section", cl::Hidden, cl::desc("Generate dwarf aranges"), @@ -419,6 +423,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) // a monolithic string offsets table without any header. UseSegmentedStringOffsetsTable = DwarfVersion >= 5; + // Emit call-site-param debug info for GDB and LLDB, if the target supports + // the debug entry values feature. It can also be enabled explicitly. 
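The comment above captures the new split: SupportsDebugEntryValues says the target officially supports the feature, while the -emit-debug-entry-values escape hatch forces it on for testing. A backend opts in through the TargetMachine setter added earlier in this patch; a minimal sketch (where the call lives in a real target's constructor is an assumption):

  #include "llvm/Target/TargetMachine.h"
  using namespace llvm;

  // Sketch: after this call, Options.ShouldEmitDebugEntryValues() holds, and
  // GDB/LLDB-tuned DWARF gets call-site parameter info with no extra flags.
  static void optIntoEntryValues(TargetMachine &TM) {
    TM.setSupportsDebugEntryValues(true);
  }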
+ EmitDebugEntryValues = (Asm->TM.Options.ShouldEmitDebugEntryValues() && + (tuneForGDB() || tuneForLLDB())) || + EmitDwarfDebugEntryValues; + Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); } @@ -886,9 +896,8 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(ScopeDIE, CalleeDIE, IsTail, PCAddr, CallReg); - // GDB and LLDB support call site parameter debug info. - if (Asm->TM.Options.EnableDebugEntryValues && - (tuneForGDB() || tuneForLLDB())) { + // Optionally emit call-site-param debug info. + if (emitDebugEntryValues()) { ParamSet Params; // Try to interpret values of call site parameters. collectCallSiteParameters(&MI, Params); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index a44960589d89fd..882fc739d792ac 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -386,6 +386,11 @@ class DwarfDebug : public DebugHandlerBase { /// a monolithic sequence of string offsets. bool UseSegmentedStringOffsetsTable; + /// Enable production of call site parameters needed to print the debug entry + /// values. Useful for testing purposes when a debugger does not support the + /// feature yet. + bool EmitDebugEntryValues; + /// Separated Dwarf Variables /// In general these will all be for bits that are left in the /// original object file, rather than things that are meant @@ -708,6 +713,10 @@ class DwarfDebug : public DebugHandlerBase { return UseSegmentedStringOffsetsTable; } + bool emitDebugEntryValues() const { + return EmitDebugEntryValues; + } + bool shareAcrossDWOCUs() const; /// Returns the Dwarf Version. diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 7acb84df582fa0..d5dc49a91177bb 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -380,7 +380,7 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { static cl::opt EnableDebugEntryValues( "debug-entry-values", - cl::desc("Emit debug info about parameter's entry values"), + cl::desc("Enable debug info for the debug entry values."), cl::init(false)); CGBINDOPT(EnableDebugEntryValues); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4e0daadead30d0..ecb46f401fb442 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1483,6 +1483,37 @@ bool CombinerHelper::tryCombineShiftToUnmerge(MachineInstr &MI, return false; } +bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { + return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) { + return MO.isReg() && + getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI); + }); +} + +bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { + assert(MI.getNumDefs() == 1 && "Expected only one def?"); + Builder.setInstr(MI); + Builder.buildFConstant(MI.getOperand(0), C); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { + assert(MI.getNumDefs() == 1 && "Expected only one def?"); + Builder.setInstr(MI); + Builder.buildConstant(MI.getOperand(0), C); + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::replaceInstWithUndef(MachineInstr &MI) { + assert(MI.getNumDefs() == 1 && "Expected only one def?"); + Builder.setInstr(MI); + Builder.buildUndef(MI.getOperand(0)); + MI.eraseFromParent(); + 
return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp index fc9c802693abdc..54e5d48edf2761 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,11 +19,12 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; +const unsigned RegisterBank::InvalidHwMode = UINT_MAX; RegisterBank::RegisterBank( - unsigned ID, const char *Name, unsigned Size, + unsigned ID, const char *Name, const unsigned *Sizes, const uint32_t *CoveredClasses, unsigned NumRegClasses) - : ID(ID), Name(Name), Size(Size) { + : ID(ID), Name(Name), Sizes(Sizes), HwMode(InvalidHwMode) { ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } @@ -63,7 +64,8 @@ bool RegisterBank::covers(const TargetRegisterClass &RC) const { } bool RegisterBank::isValid() const { - return ID != InvalidID && Name != nullptr && Size != 0 && + return ID != InvalidID && Name != nullptr && Sizes != nullptr && + HwMode != InvalidID && // A register bank that does not cover anything is useless. !ContainedRegClasses.empty(); } diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 255ea693b5c4ab..3a8d0a9d3c4fc9 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -56,8 +56,11 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; // RegisterBankInfo implementation. //------------------------------------------------------------------------------ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, - unsigned NumRegBanks) + unsigned NumRegBanks, unsigned HwMode) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { + // Initialize HwMode for all RegBanks + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) + RegBanks[Idx]->HwMode = HwMode; #ifndef NDEBUG for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index 94c5cc58ac1e36..a013c419b7c790 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -1125,7 +1125,7 @@ void LiveDebugValues::transferRegisterDef( if (auto *TPC = getAnalysisIfAvailable()) { auto &TM = TPC->getTM(); - if (TM.Options.EnableDebugEntryValues) + if (TM.Options.ShouldEmitDebugEntryValues()) emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, KillSet); } } @@ -1630,7 +1630,7 @@ void LiveDebugValues::recordEntryValue(const MachineInstr &MI, VarLocMap &VarLocIDs) { if (auto *TPC = getAnalysisIfAvailable()) { auto &TM = TPC->getTM(); - if (!TM.Options.EnableDebugEntryValues) + if (!TM.Options.ShouldEmitDebugEntryValues()) return; } diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 19bcb09530f7c1..d63f194f7e6620 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -97,6 +97,13 @@ static cl::opt EnableLinkOnceODROutlining( cl::desc("Enable the machine outliner on linkonceodr functions"), cl::init(false)); +// Set the number of times to repeatedly apply outlining. +// Defaults to 1, but more repetitions can save additional size. 
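A usage note on the knob introduced above: the option defined just below is spelled machine-outline-runs, so extra passes of the outliner (which can outline from already-outlined functions) are requested as, e.g.:

  llc -enable-machine-outliner -machine-outline-runs=2 foo.ll -o foo.s

On repeat iterations, createOutlinedFunction (further below) names symbols OUTLINED_FUNCTION_<Iter+1>_<N> instead of OUTLINED_FUNCTION_<N>, so functions created on different iterations cannot collide.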
+static cl::opt + NumRepeat("machine-outline-runs", cl::Hidden, + cl::desc("The number of times to apply machine outlining"), + cl::init(1)); + namespace { /// Represents an undefined index in the suffix tree. @@ -842,6 +849,9 @@ struct MachineOutliner : public ModulePass { /// linkonceodr linkage. bool OutlineFromLinkOnceODRs = false; + /// The current repeat number of machine outlining. + unsigned OutlineRepeatedNum = 0; + /// Set to true if the outliner should run on all functions in the module /// considered safe for outlining. /// Set to true by default for compatibility with llc's -run-pass option. @@ -900,9 +910,12 @@ struct MachineOutliner : public ModulePass { InstructionMapper &Mapper, unsigned Name); - /// Calls 'doOutline()'. + /// Calls runOnceOnModule NumRepeat times bool runOnModule(Module &M) override; + /// Calls 'doOutline()'. + bool runOnceOnModule(Module &M, unsigned Iter); + /// Construct a suffix tree on the instructions in \p M and outline repeated /// strings from that tree. bool doOutline(Module &M, unsigned &OutlinedFunctionNum); @@ -1099,7 +1112,13 @@ MachineFunction *MachineOutliner::createOutlinedFunction( // Create the function name. This should be unique. // FIXME: We should have a better naming scheme. This should be stable, // regardless of changes to the outliner's cost model/traversal order. - std::string FunctionName = ("OUTLINED_FUNCTION_" + Twine(Name)).str(); + std::string FunctionName; + if (OutlineRepeatedNum > 0) + FunctionName = ("OUTLINED_FUNCTION_" + Twine(OutlineRepeatedNum + 1) + "_" + + Twine(Name)) + .str(); + else + FunctionName = ("OUTLINED_FUNCTION_" + Twine(Name)).str(); // Create the function using an IR-level function. LLVMContext &C = M.getContext(); @@ -1438,12 +1457,14 @@ void MachineOutliner::emitInstrCountChangedRemark( } } -bool MachineOutliner::runOnModule(Module &M) { +bool MachineOutliner::runOnceOnModule(Module &M, unsigned Iter) { // Check if there's anything in the module. If it's empty, then there's // nothing to outline. if (M.empty()) return false; + OutlineRepeatedNum = Iter; + // Number to append to the current outlined function. unsigned OutlinedFunctionNum = 0; @@ -1507,3 +1528,23 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) { return OutlinedSomething; } + +// Apply machine outlining for NumRepeat times. 
+bool MachineOutliner::runOnModule(Module &M) { + if (NumRepeat < 1) + report_fatal_error("Expect NumRepeat for machine outlining " + "to be greater than or equal to 1!\n"); + + bool Changed = false; + for (unsigned I = 0; I < NumRepeat; I++) { + if (!runOnceOnModule(M, I)) { + LLVM_DEBUG(dbgs() << "Stopped outlining at iteration " << I + << " because no changes were found.\n";); + return Changed; + } + Changed = true; + } + LLVM_DEBUG(dbgs() << "Stopped outlining because iteration is " + "equal to " << NumRepeat << "\n";); + return Changed; +} diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index a2ca8bcf43ac11..b12ccc40eb61f0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -812,7 +812,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) if (!MFI.isDeadObjectIndex(i) && MFI.getStackID(i) == TargetStackID::Default) - assert(MFI.getObjectAlignment(i) <= MFI.getMaxAlignment() && + assert(MFI.getObjectAlign(i) <= MFI.getMaxAlign() && "MaxAlignment is invalid"); #endif diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5a4b4c615bc06a..09934bbb29fe03 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -20,10 +20,11 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TypeSize.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" @@ -1627,11 +1628,6 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); } - EVT MemoryVT = MGT->getMemoryVT(); - EVT LoMemVT, HiMemVT; - // Split MemoryVT - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue PassThruLo, PassThruHi; if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(PassThru, PassThruLo, PassThruHi); @@ -1644,10 +1640,10 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); - MachineMemOperand *MMO = DAG.getMachineFunction(). 
- getMachineMemOperand(MGT->getPointerInfo(), - MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), - Alignment, MGT->getAAInfo(), MGT->getRanges()); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MGT->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(), + MGT->getRanges()); SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, @@ -2376,13 +2372,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, SDValue Index = N->getIndex(); SDValue Scale = N->getScale(); SDValue Data = N->getValue(); - EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); // Split all operands - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) @@ -2409,20 +2402,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo; - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOStore, LoMemVT.getStoreSize(), - Alignment, N->getAAInfo(), N->getRanges()); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO, N->getIndexType()); - MMO = DAG.getMachineFunction(). - getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOStore, HiMemVT.getStoreSize(), - Alignment, N->getAAInfo(), N->getRanges()); - // The order of the Scatter operation after split is well defined. The "Hi" // part comes after the "Lo". So these two operations should be chained one // after another. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index dcd072d7631f5a..5b673486af1532 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7435,7 +7435,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, - ArrayRef Ops) { + ArrayRef Ops, const SDNodeFlags Flags) { if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); @@ -7504,6 +7504,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, return SDValue(E, 0); N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList); + N->setFlags(Flags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e124550eb0a516..3f869432319832 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -415,10 +415,13 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. EVT BuiltVectorTy = - EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(), - (IntermediateVT.isVector() - ? 
IntermediateVT.getVectorNumElements() * NumParts - : NumIntermediates)); + IntermediateVT.isVector() + ? EVT::getVectorVT( + *DAG.getContext(), IntermediateVT.getScalarType(), + IntermediateVT.getVectorElementCount() * NumParts) + : EVT::getVectorVT(*DAG.getContext(), + IntermediateVT.getScalarType(), + NumIntermediates); Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, BuiltVectorTy, Ops); @@ -1113,29 +1116,23 @@ void SelectionDAGBuilder::visit(const Instruction &I) { visit(I.getOpcode(), I); if (auto *FPMO = dyn_cast(&I)) { - // Propagate the fast-math-flags of this IR instruction to the DAG node that - // maps to this instruction. - // TODO: We could handle all flags (nsw, etc) here. - // TODO: If an IR instruction maps to >1 node, only the final node will have - // flags set. - if (SDNode *Node = getNodeForIRValue(&I)) { - SDNodeFlags IncomingFlags; - IncomingFlags.copyFMF(*FPMO); - if (!Node->getFlags().isDefined()) - Node->setFlags(IncomingFlags); - else - Node->intersectFlagsWith(IncomingFlags); - } - } - // Constrained FP intrinsics with fpexcept.ignore should also get - // the NoFPExcept flag. - if (auto *FPI = dyn_cast(&I)) - if (FPI->getExceptionBehavior() == fp::ExceptionBehavior::ebIgnore) + // ConstrainedFPIntrinsics handle their own FMF. + if (!isa(&I)) { + // Propagate the fast-math-flags of this IR instruction to the DAG node that + // maps to this instruction. + // TODO: We could handle all flags (nsw, etc) here. + // TODO: If an IR instruction maps to >1 node, only the final node will have + // flags set. if (SDNode *Node = getNodeForIRValue(&I)) { - SDNodeFlags Flags = Node->getFlags(); - Flags.setNoFPExcept(true); - Node->setFlags(Flags); + SDNodeFlags IncomingFlags; + IncomingFlags.copyFMF(*FPMO); + if (!Node->getFlags().isDefined()) + Node->setFlags(IncomingFlags); + else + Node->intersectFlagsWith(IncomingFlags); } + } + } if (!I.isTerminator() && !HasTailCall && !isStatepoint(&I)) // statepoints handle their exports internally @@ -7064,6 +7061,13 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( SDVTList VTs = DAG.getVTList(ValueVTs); fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue(); + SDNodeFlags Flags; + if (EB == fp::ExceptionBehavior::ebIgnore) + Flags.setNoFPExcept(true); + + if (auto *FPOp = dyn_cast(&FPI)) + Flags.copyFMF(*FPOp); + unsigned Opcode; switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
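A minimal sketch (not from the patch) of what the SDVTList getNode overload added in SelectionDAG.cpp above enables in this function: the flag state computed before the switch rides along at node-creation time instead of being patched onto the node afterwards. The names below come from the enclosing visitConstrainedFPIntrinsic:

  // Illustrative fragment; DAG, sdl, VTs, Opers, EB and FPI are the
  // surrounding function's locals and parameter.
  SDNodeFlags Flags;
  if (EB == fp::ExceptionBehavior::ebIgnore)
    Flags.setNoFPExcept(true);
  if (auto *FPOp = dyn_cast<FPMathOperator>(&FPI))
    Flags.copyFMF(*FPOp);
  SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags);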
@@ -7079,7 +7083,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), ValueVTs[0])) { Opers.pop_back(); - SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers); + SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags); pushOutChain(Mul, EB); Opcode = ISD::STRICT_FADD; Opers.clear(); @@ -7107,7 +7111,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( } } - SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers); + SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags); pushOutChain(Result, EB); SDValue FPResult = Result.getValue(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6148b24e3e0005..70d8656d2875e0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5695,8 +5695,7 @@ TargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG, } SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, - bool ForCodeSize, + bool LegalOps, bool OptForSize, unsigned Depth) const { // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) @@ -5704,13 +5703,19 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Depth <= SelectionDAG::MaxRecursionDepth && "getNegatedExpression doesn't match getNegatibleCost"); + + // Pre-increment recursion depth for use in recursive calls. + ++Depth; const SDNodeFlags Flags = Op->getFlags(); + EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + SDLoc DL(Op); - switch (Op.getOpcode()) { + switch (Opcode) { case ISD::ConstantFP: { APFloat V = cast(Op)->getValueAPF(); V.changeSign(); - return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); + return DAG.getConstantFP(V, DL, VT); } case ISD::BUILD_VECTOR: { SmallVector Ops; @@ -5721,60 +5726,52 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } APFloat V = cast(C)->getValueAPF(); V.changeSign(); - Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); + Ops.push_back(DAG.getConstantFP(V, DL, C.getValueType())); } - return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); + return DAG.getBuildVector(VT, DL, Ops); } case ISD::FADD: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); assert((DAG.getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && "Expected NSZ fp-flag"); - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 != NegatibleCost::Expensive) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(0), Flags); - } - case ISD::FSUB: - // fold (fneg (fsub 0, B)) -> B - if (ConstantFPSDNode *N0CFP = - isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) - if (N0CFP->isZero()) - return Op.getOperand(1); + // fold (fneg (fadd X, Y)) -> (fsub (fneg X), Y) + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + if (CostX != NegatibleCost::Expensive) + return DAG.getNode( + ISD::FSUB, DL, VT, + 
getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth), Y, Flags); - // fold (fneg (fsub A, B)) -> (fsub B, A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0), Flags); + // fold (fneg (fadd X, Y)) -> (fsub (fneg Y), X) + return DAG.getNode( + ISD::FSUB, DL, VT, + getNegatedExpression(Y, DAG, LegalOps, OptForSize, Depth), X, Flags); + } + case ISD::FSUB: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + // fold (fneg (fsub 0, Y)) -> Y + if (ConstantFPSDNode *C = isConstOrConstSplatFP(X, /*AllowUndefs*/ true)) + if (C->isZero()) + return Y; + // fold (fneg (fsub X, Y)) -> (fsub Y, X) + return DAG.getNode(ISD::FSUB, DL, VT, Y, X, Flags); + } case ISD::FMUL: case ISD::FDIV: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 != NegatibleCost::Expensive) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + if (CostX != NegatibleCost::Expensive) + return DAG.getNode( + Opcode, DL, VT, + getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth), Y, Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode( - Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), - getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1), - Flags); + Opcode, DL, VT, X, + getNegatedExpression(Y, DAG, LegalOps, OptForSize, Depth), Flags); } case ISD::FMA: case ISD::FMAD: { @@ -5782,39 +5779,30 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, Flags.hasNoSignedZeros()) && "Expected NSZ fp-flag"); - SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations, - ForCodeSize, Depth + 1); - - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - NegatibleCost V1 = getNegatibleCost(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 > V1) { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2); + SDValue NegZ = getNegatedExpression(Z, DAG, LegalOps, OptForSize, Depth); + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + NegatibleCost CostY = getNegatibleCost(Y, DAG, LegalOps, OptForSize, Depth); + if (CostX > CostY) { // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) - SDValue Neg0 = getNegatedExpression( - Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0, - Op.getOperand(1), Neg2, Flags); + SDValue NegX = getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth); + return DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags); } // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) - SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Neg1, Neg2, Flags); + SDValue NegY = getNegatedExpression(Y, DAG, LegalOps, OptForSize, Depth); + return DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags); } case ISD::FP_EXTEND: case ISD::FSIN: - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), 
DAG, - LegalOperations, ForCodeSize, - Depth + 1)); + return DAG.getNode(Opcode, DL, VT, + getNegatedExpression(Op.getOperand(0), DAG, LegalOps, + OptForSize, Depth)); case ISD::FP_ROUND: - return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), + return DAG.getNode(ISD::FP_ROUND, DL, VT, + getNegatedExpression(Op.getOperand(0), DAG, LegalOps, + OptForSize, Depth), Op.getOperand(1)); } diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index d794a261ecb2f8..4866d4c171c0ee 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -45,3 +45,9 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { bool TargetOptions::HonorSignDependentRoundingFPMath() const { return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; } + +/// NOTE: There are targets that still do not support production of debug +/// entry values. +bool TargetOptions::ShouldEmitDebugEntryValues() const { + return SupportsDebugEntryValues || EnableDebugEntryValues; +} diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 264982983fc842..f93c4b87729b63 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -22,7 +22,8 @@ EVT EVT::changeExtendedTypeToInteger() const { EVT EVT::changeExtendedVectorElementTypeToInteger() const { LLVMContext &Context = LLVMTy->getContext(); EVT IntTy = getIntegerVT(Context, getScalarSizeInBits()); - return getVectorVT(Context, IntTy, getVectorNumElements()); + return getVectorVT(Context, IntTy, getVectorNumElements(), + isScalableVector()); } EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) { @@ -32,10 +33,19 @@ EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) { return VT; } -EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, - unsigned NumElements) { +EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, + bool IsScalable) { EVT ResultVT; - ResultVT.LLVMTy = VectorType::get(VT.getTypeForEVT(Context), NumElements); + ResultVT.LLVMTy = + VectorType::get(VT.getTypeForEVT(Context), NumElements, IsScalable); + assert(ResultVT.isExtended() && "Type is not extended!"); + return ResultVT; +} + +EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) { + EVT ResultVT; + ResultVT.LLVMTy = + VectorType::get(VT.getTypeForEVT(Context), {EC.Min, EC.Scalable}); + assert(ResultVT.isExtended() && "Type is not extended!"); + return ResultVT; } @@ -92,6 +102,14 @@ bool EVT::isExtended2048BitVector() const { return isExtendedVector() && getExtendedSizeInBits() == 2048; } +bool EVT::isExtendedFixedLengthVector() const { + return isExtendedVector() && !cast<VectorType>(LLVMTy)->isScalable(); +} + +bool EVT::isExtendedScalableVector() const { + return isExtendedVector() && cast<VectorType>(LLVMTy)->isScalable(); +} + EVT EVT::getExtendedVectorElementType() const { assert(isExtended() && "Type is not extended!"); return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType()); } @@ -102,6 +120,11 @@ unsigned EVT::getExtendedVectorNumElements() const { return cast<VectorType>(LLVMTy)->getNumElements(); } +ElementCount EVT::getExtendedVectorElementCount() const { + assert(isExtended() && "Type is not extended!"); + return cast<VectorType>(LLVMTy)->getElementCount(); +} + TypeSize EVT::getExtendedSizeInBits() const { assert(isExtended() && "Type is not extended!"); if (IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy)) diff --git 
a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 3905ce9bf5aca1..a8e88a9785c591 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -165,10 +165,12 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, return; } - R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), - JITDylibLookupFlags::MatchAllSymbols)); - R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), - std::move(Callables), AliaseeImpls)); + if (!NonCallables.empty()) + R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), + JITDylibLookupFlags::MatchAllSymbols)); + if (!Callables.empty()) + R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), + std::move(Callables), AliaseeImpls)); } CompileOnDemandLayer::PerDylibResources & diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index c651fe68cb1551..640f040b700a97 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -516,8 +516,15 @@ void MaterializationResponsibility::failMaterialization() { void MaterializationResponsibility::replace( std::unique_ptr MU) { - for (auto &KV : MU->getSymbols()) + // If the replacement MU is empty then return. + if (MU->getSymbols().empty()) + return; + + for (auto &KV : MU->getSymbols()) { + assert(SymbolFlags.count(KV.first) && + "Replacing definition outside this responsibility set"); SymbolFlags.erase(KV.first); + } if (MU->getInitializerSymbol() == InitSymbol) InitSymbol = nullptr; @@ -934,7 +941,11 @@ void JITDylib::replace(std::unique_ptr MU) { "Unexpected materializer entry in map"); SymI->second.setAddress(SymI->second.getAddress()); SymI->second.setMaterializerAttached(true); - UnmaterializedInfos[KV.first] = UMI; + + auto &UMIEntry = UnmaterializedInfos[KV.first]; + assert((!UMIEntry || !UMIEntry->MU) && + "Replacing symbol with materializer still attached"); + UMIEntry = UMI; } return nullptr; diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 63a5b1f09c821f..9451a572549316 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -175,7 +175,6 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { } Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) { - std::lock_guard Lock(PlatformSupportMutex); if (auto &InitSym = MU.getInitializerSymbol()) InitSymbols[&JD].add(InitSym); else { @@ -236,11 +235,13 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { } void registerInitFunc(JITDylib &JD, SymbolStringPtr InitName) { - std::lock_guard Lock(PlatformSupportMutex); - InitFunctions[&JD].add(InitName); + getExecutionSession().runSessionLocked([&]() { + InitFunctions[&JD].add(InitName); + }); } private: + Expected> getInitializers(JITDylib &JD) { if (auto Err = issueInitLookups(JD)) return std::move(Err); @@ -248,18 +249,17 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { DenseMap LookupSymbols; std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - DFSLinkOrder = getDFSLinkOrder(JD); + getExecutionSession().runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); - for (auto *NextJD : DFSLinkOrder) { - auto IFItr = InitFunctions.find(NextJD); - if (IFItr != InitFunctions.end()) { - LookupSymbols[NextJD] = std::move(IFItr->second); - 
InitFunctions.erase(IFItr); + for (auto *NextJD : DFSLinkOrder) { + auto IFItr = InitFunctions.find(NextJD); + if (IFItr != InitFunctions.end()) { + LookupSymbols[NextJD] = std::move(IFItr->second); + InitFunctions.erase(IFItr); + } } - } - } + }); LLVM_DEBUG({ dbgs() << "JITDylib init order is [ "; @@ -300,21 +300,20 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { DenseMap LookupSymbols; std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - DFSLinkOrder = getDFSLinkOrder(JD); - - for (auto *NextJD : DFSLinkOrder) { - auto &JDLookupSymbols = LookupSymbols[NextJD]; - auto DIFItr = DeInitFunctions.find(NextJD); - if (DIFItr != DeInitFunctions.end()) { - LookupSymbols[NextJD] = std::move(DIFItr->second); - DeInitFunctions.erase(DIFItr); - } - JDLookupSymbols.add(LLJITRunAtExits, + ES.runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); + + for (auto *NextJD : DFSLinkOrder) { + auto &JDLookupSymbols = LookupSymbols[NextJD]; + auto DIFItr = DeInitFunctions.find(NextJD); + if (DIFItr != DeInitFunctions.end()) { + LookupSymbols[NextJD] = std::move(DIFItr->second); + DeInitFunctions.erase(DIFItr); + } + JDLookupSymbols.add(LLJITRunAtExits, SymbolLookupFlags::WeaklyReferencedSymbol); } - } + }); auto LookupResult = Platform::lookupInitSymbols(ES, LookupSymbols); @@ -366,20 +365,19 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { /// JITDylibs that it depends on). Error issueInitLookups(JITDylib &JD) { DenseMap RequiredInitSymbols; + std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - - auto DFSLinkOrder = getDFSLinkOrder(JD); + getExecutionSession().runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); - for (auto *NextJD : DFSLinkOrder) { - auto ISItr = InitSymbols.find(NextJD); - if (ISItr != InitSymbols.end()) { - RequiredInitSymbols[NextJD] = std::move(ISItr->second); - InitSymbols.erase(ISItr); + for (auto *NextJD : DFSLinkOrder) { + auto ISItr = InitSymbols.find(NextJD); + if (ISItr != InitSymbols.end()) { + RequiredInitSymbols[NextJD] = std::move(ISItr->second); + InitSymbols.erase(ISItr); + } } - } - } + }); return Platform::lookupInitSymbols(getExecutionSession(), RequiredInitSymbols) @@ -435,7 +433,6 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { return ThreadSafeModule(std::move(M), std::move(Ctx)); } - std::mutex PlatformSupportMutex; LLJIT &J; SymbolStringPtr InitFunctionPrefix; DenseMap InitSymbols; diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 9a836677ef15b5..cae0271969c6be 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -163,7 +163,6 @@ Error MachOPlatform::notifyAdding(JITDylib &JD, const MaterializationUnit &MU) { if (!InitSym) return Error::success(); - std::lock_guard Lock(PlatformMutex); RegisteredInitSymbols[&JD].add(InitSym); LLVM_DEBUG({ dbgs() << "MachOPlatform: Registered init symbol " << *InitSym << " for MU " @@ -187,11 +186,10 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { std::vector DFSLinkOrder; while (true) { - // Lock the platform while we search for any initializer symbols to - // look up. 
+ DenseMap NewInitSymbols; - { - std::lock_guard Lock(PlatformMutex); + + ES.runSessionLocked([&]() { DFSLinkOrder = getDFSLinkOrder(JD); for (auto *InitJD : DFSLinkOrder) { @@ -201,7 +199,7 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { RegisteredInitSymbols.erase(RISItr); } } - } + }); if (NewInitSymbols.empty()) break; @@ -228,7 +226,7 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { // Lock again to collect the initializers. InitializerSequence FullInitSeq; { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); for (auto *InitJD : reverse(DFSLinkOrder)) { LLVM_DEBUG({ dbgs() << "MachOPlatform: Appending inits for \"" << InitJD->getName() @@ -251,7 +249,7 @@ MachOPlatform::getDeinitializerSequence(JITDylib &JD) { DeinitializerSequence FullDeinitSeq; { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); for (auto *DeinitJD : DFSLinkOrder) { FullDeinitSeq.emplace_back(DeinitJD, MachOJITDylibDeinitializers()); } @@ -285,7 +283,7 @@ void MachOPlatform::registerInitInfo( MachOJITDylibInitializers::SectionExtent ModInits, MachOJITDylibInitializers::SectionExtent ObjCSelRefs, MachOJITDylibInitializers::SectionExtent ObjCClassList) { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); auto &InitSeq = InitSeqs[&JD]; @@ -384,7 +382,7 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( else dbgs() << "none\n"; - dbgs() << "__mod_init_func: "; + dbgs() << " __mod_init_func: "; if (ModInits.NumPtrs) dbgs() << ModInits.NumPtrs << " pointer(s) at " << formatv("{0:x16}", ModInits.Address) << "\n"; diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp index cf55d09caf7e63..43255e16140adb 100644 --- a/llvm/lib/FuzzMutate/Operations.cpp +++ b/llvm/lib/FuzzMutate/Operations.cpp @@ -244,20 +244,24 @@ static SourcePred matchScalarInAggregate() { static SourcePred validInsertValueIndex() { auto Pred = [](ArrayRef Cur, const Value *V) { - auto *CTy = cast(Cur[0]->getType()); if (auto *CI = dyn_cast(V)) - if (CI->getBitWidth() == 32 && - CTy->getTypeAtIndex(CI->getZExtValue()) == Cur[1]->getType()) - return true; + if (CI->getBitWidth() == 32) { + Type *Indexed = ExtractValueInst::getIndexedType(Cur[0]->getType(), + CI->getZExtValue()); + return Indexed == Cur[1]->getType(); + } return false; }; auto Make = [](ArrayRef Cur, ArrayRef Ts) { std::vector Result; auto *Int32Ty = Type::getInt32Ty(Cur[0]->getContext()); - auto *CTy = cast(Cur[0]->getType()); - for (int I = 0, E = getAggregateNumElements(CTy); I < E; ++I) - if (CTy->getTypeAtIndex(I) == Cur[1]->getType()) + auto *BaseTy = Cur[0]->getType(); + int I = 0; + while (Type *Indexed = ExtractValueInst::getIndexedType(BaseTy, I)) { + if (Indexed == Cur[1]->getType()) Result.push_back(ConstantInt::get(Int32Ty, I)); + ++I; + } return Result; }; return {Pred, Make}; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index dc78c5537befc5..3e2e74c31fc0a6 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -2389,10 +2389,11 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, SmallVector NewIdxs; Type *Ty = PointeeTy; Type *Prev = C->getType(); + auto GEPIter = gep_type_begin(PointeeTy, Idxs); bool Unknown = !isa(Idxs[0]) && !isa(Idxs[0]); for (unsigned i = 1, e = Idxs.size(); i != e; - Prev = Ty, Ty = cast(Ty)->getTypeAtIndex(Idxs[i]), ++i) { + Prev = Ty, Ty = (++GEPIter).getIndexedType(), ++i) { if (!isa(Idxs[i]) && !isa(Idxs[i])) { // We don't know if it's in 
range or not. Unknown = true; diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index c99b8b0023ca74..aff356ce3c82d6 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1047,19 +1047,20 @@ static Constant *getSequenceIfElementsMatch(Constant *C, return nullptr; } -ConstantAggregate::ConstantAggregate(CompositeType *T, ValueTy VT, +ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT, ArrayRef V) : Constant(T, VT, OperandTraits::op_end(this) - V.size(), V.size()) { llvm::copy(V, op_begin()); // Check that types match, unless this is an opaque struct. - if (auto *ST = dyn_cast(T)) + if (auto *ST = dyn_cast(T)) { if (ST->isOpaque()) return; - for (unsigned I = 0, E = V.size(); I != E; ++I) - assert(V[I]->getType() == T->getTypeAtIndex(I) && - "Initializer for composite element doesn't match!"); + for (unsigned I = 0, E = V.size(); I != E; ++I) + assert(V[I]->getType() == ST->getTypeAtIndex(I) && + "Initializer for struct element doesn't match!"); + } } ConstantArray::ConstantArray(ArrayType *T, ArrayRef V) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 68eed612e4bfde..0884a24a709e5d 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1659,35 +1659,44 @@ GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI) SubclassOptionalData = GEPI.SubclassOptionalData; } -/// getIndexedType - Returns the type of the element that would be accessed with -/// a gep instruction with the specified parameters. -/// -/// The Idxs pointer should point to a continuous piece of memory containing the -/// indices, either as Value* or uint64_t. -/// -/// A null type is returned if the indices are invalid for the specified -/// pointer type. -/// -template -static Type *getIndexedTypeInternal(Type *Agg, ArrayRef IdxList) { - // Handle the special case of the empty set index set, which is always valid. - if (IdxList.empty()) - return Agg; - - // If there is at least one index, the top level type must be sized, otherwise - // it cannot be 'stepped over'. - if (!Agg->isSized()) +Type *GetElementPtrInst::getTypeAtIndex(Type *Ty, Value *Idx) { + if (auto Struct = dyn_cast(Ty)) { + if (!Struct->indexValid(Idx)) + return nullptr; + return Struct->getTypeAtIndex(Idx); + } + if (!Idx->getType()->isIntOrIntVectorTy()) return nullptr; + if (auto Array = dyn_cast(Ty)) + return Array->getElementType(); + if (auto Vector = dyn_cast(Ty)) + return Vector->getElementType(); + return nullptr; +} - unsigned CurIdx = 1; - for (; CurIdx != IdxList.size(); ++CurIdx) { - CompositeType *CT = dyn_cast(Agg); - if (!CT || CT->isPointerTy()) return nullptr; - IndexTy Index = IdxList[CurIdx]; - if (!CT->indexValid(Index)) return nullptr; - Agg = CT->getTypeAtIndex(Index); +Type *GetElementPtrInst::getTypeAtIndex(Type *Ty, uint64_t Idx) { + if (auto Struct = dyn_cast(Ty)) { + if (Idx >= Struct->getNumElements()) + return nullptr; + return Struct->getElementType(Idx); } - return CurIdx == IdxList.size() ? 
Agg : nullptr; + if (auto Array = dyn_cast(Ty)) + return Array->getElementType(); + if (auto Vector = dyn_cast(Ty)) + return Vector->getElementType(); + return nullptr; +} + +template +static Type *getIndexedTypeInternal(Type *Ty, ArrayRef IdxList) { + if (IdxList.empty()) + return Ty; + for (IndexTy V : IdxList.slice(1)) { + Ty = GetElementPtrInst::getTypeAtIndex(Ty, V); + if (!Ty) + return Ty; + } + return Ty; } Type *GetElementPtrInst::getIndexedType(Type *Ty, ArrayRef IdxList) { @@ -2220,15 +2229,15 @@ Type *ExtractValueInst::getIndexedType(Type *Agg, if (ArrayType *AT = dyn_cast(Agg)) { if (Index >= AT->getNumElements()) return nullptr; + Agg = AT->getElementType(); } else if (StructType *ST = dyn_cast(Agg)) { if (Index >= ST->getNumElements()) return nullptr; + Agg = ST->getElementType(Index); } else { // Not a valid type to index into. return nullptr; } - - Agg = cast(Agg)->getTypeAtIndex(Index); } return const_cast(Agg); } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 78f98fd191c08d..1ce17aa63bdbfc 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -28,6 +28,8 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" + #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -178,6 +180,140 @@ bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) { } } +ElementCount VPIntrinsic::getStaticVectorLength() const { + auto GetVectorLengthOfType = [](const Type *T) -> ElementCount { + auto VT = cast(T); + auto ElemCount = VT->getElementCount(); + return ElemCount; + }; + + auto VPMask = getMaskParam(); + return GetVectorLengthOfType(VPMask->getType()); +} + +Value *VPIntrinsic::getMaskParam() const { + auto maskPos = GetMaskParamPos(getIntrinsicID()); + if (maskPos) + return getArgOperand(maskPos.getValue()); + return nullptr; +} + +Value *VPIntrinsic::getVectorLengthParam() const { + auto vlenPos = GetVectorLengthParamPos(getIntrinsicID()); + if (vlenPos) + return getArgOperand(vlenPos.getValue()); + return nullptr; +} + +Optional VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + default: + return None; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + return MASKPOS; +#include "llvm/IR/VPIntrinsics.def" + } +} + +Optional VPIntrinsic::GetVectorLengthParamPos(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + default: + return None; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + return VLENPOS; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool VPIntrinsic::IsVPIntrinsic(Intrinsic::ID ID) { + switch (ID) { + default: + return false; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + break; +#include "llvm/IR/VPIntrinsics.def" + } + return true; +} + +// Equivalent non-predicated opcode +unsigned VPIntrinsic::GetFunctionalOpcodeForVP(Intrinsic::ID ID) { + switch (ID) { + default: + return Instruction::Call; + +#define HANDLE_VP_TO_OC(VPID, OC) \ + case Intrinsic::VPID: \ + return Instruction::OC; +#include "llvm/IR/VPIntrinsics.def" + } +} + +Intrinsic::ID VPIntrinsic::GetForOpcode(unsigned OC) { + switch (OC) { + default: + return Intrinsic::not_intrinsic; + +#define HANDLE_VP_TO_OC(VPID, OC) \ + case Instruction::OC: \ + return Intrinsic::VPID; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool VPIntrinsic::canIgnoreVectorLengthParam() const { + using namespace PatternMatch; + + 
ElementCount EC = getStaticVectorLength(); + + // No vlen param - no lanes masked-off by it. + auto *VLParam = getVectorLengthParam(); + if (!VLParam) + return true; + + // Note that the VP intrinsic causes undefined behavior if the Explicit Vector + // Length parameter is strictly greater-than the number of vector elements of + // the operation. This function returns true when this is detected statically + // in the IR. + + // Check whether "W == vscale * EC.Min" + if (EC.Scalable) { + // Dig the DataLayout out of the enclosing module. + auto ParMod = this->getModule(); + if (!ParMod) + return false; + const auto &DL = ParMod->getDataLayout(); + + // Compare vscale patterns + uint64_t ParamFactor; + if (EC.Min > 1 && + match(VLParam, m_c_BinOp(m_ConstantInt(ParamFactor), m_VScale(DL)))) { + return ParamFactor >= EC.Min; + } + if (match(VLParam, m_VScale(DL))) { + // A plain 'vscale' VL covers all lanes only when EC.Min == 1. + return EC.Min == 1; + } + return false; + } + + // Standard SIMD operation: expect a constant vector length. + auto VLConst = dyn_cast<ConstantInt>(VLParam); + if (!VLConst) + return false; + + uint64_t VLNum = VLConst->getZExtValue(); + return VLNum >= EC.Min; +} + Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { switch (getIntrinsicID()) { case Intrinsic::uadd_with_overflow: diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 3eab5042b54248..e91bc8aa7e708b 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -529,52 +529,22 @@ StructType *Module::getTypeByName(StringRef Name) const { return getContext().pImpl->NamedStructTypes.lookup(Name); } -//===----------------------------------------------------------------------===// -// CompositeType Implementation -//===----------------------------------------------------------------------===// - -Type *CompositeType::getTypeAtIndex(const Value *V) const { - if (auto *STy = dyn_cast<StructType>(this)) { - unsigned Idx = - (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue(); - assert(indexValid(Idx) && "Invalid structure index!"); - return STy->getElementType(Idx); - } - - return cast<SequentialType>(this)->getElementType(); +Type *StructType::getTypeAtIndex(const Value *V) const { + unsigned Idx = (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue(); + assert(indexValid(Idx) && "Invalid structure index!"); + return getElementType(Idx); } -Type *CompositeType::getTypeAtIndex(unsigned Idx) const{ - if (auto *STy = dyn_cast<StructType>(this)) { - assert(indexValid(Idx) && "Invalid structure index!"); - return STy->getElementType(Idx); - } - - return cast<SequentialType>(this)->getElementType(); -} - -bool CompositeType::indexValid(const Value *V) const { - if (auto *STy = dyn_cast<StructType>(this)) { - // Structure indexes require (vectors of) 32-bit integer constants. In the - // vector case all of the indices must be equal. - if (!V->getType()->isIntOrIntVectorTy(32)) - return false; - const Constant *C = dyn_cast<Constant>(V); - if (C && V->getType()->isVectorTy()) - C = C->getSplatValue(); - const ConstantInt *CU = dyn_cast_or_null<ConstantInt>(C); - return CU && CU->getZExtValue() < STy->getNumElements(); - } - - // Sequential types can be indexed by any integer. - return V->getType()->isIntOrIntVectorTy(); -} - -bool CompositeType::indexValid(unsigned Idx) const { - if (auto *STy = dyn_cast<StructType>(this)) - return Idx < STy->getNumElements(); - // Sequential types can be indexed by any integer. - return true; +bool StructType::indexValid(const Value *V) const { + // Structure indexes require (vectors of) 32-bit integer constants. In the + // vector case all of the indices must be equal. 
+ if (!V->getType()->isIntOrIntVectorTy(32)) + return false; + const Constant *C = dyn_cast(V); + if (C && V->getType()->isVectorTy()) + C = C->getSplatValue(); + const ConstantInt *CU = dyn_cast_or_null(C); + return CU && CU->getZExtValue() < getNumElements(); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 8cd589759eab4f..962e67437280eb 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -666,7 +666,7 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, if (DerefBytes == 0 && (A->hasByValAttr() || A->hasStructRetAttr())) { Type *PT = cast(A->getType())->getElementType(); if (PT->isSized()) - DerefBytes = DL.getTypeStoreSize(PT); + DerefBytes = DL.getTypeStoreSize(PT).getKnownMinSize(); } if (DerefBytes == 0) { DerefBytes = A->getDereferenceableOrNullBytes(); @@ -707,14 +707,15 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, } } else if (auto *AI = dyn_cast(this)) { if (!AI->isArrayAllocation()) { - DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()); + DerefBytes = + DL.getTypeStoreSize(AI->getAllocatedType()).getKnownMinSize(); CanBeNull = false; } } else if (auto *GV = dyn_cast(this)) { if (GV->getValueType()->isSized() && !GV->hasExternalWeakLinkage()) { // TODO: Don't outright reject hasExternalWeakLinkage but set the // CanBeNull flag. - DerefBytes = DL.getTypeStoreSize(GV->getValueType()); + DerefBytes = DL.getTypeStoreSize(GV->getValueType()).getFixedSize(); CanBeNull = false; } } diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index d634c123fbdc59..ee199aa1d4b104 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -47,7 +47,7 @@ using namespace llvm; using llvm::sys::windows::UTF8ToUTF16; using llvm::sys::windows::CurCPToUTF16; using llvm::sys::windows::UTF16ToUTF8; -using llvm::sys::path::widenPath; +using llvm::sys::windows::widenPath; static bool is_separator(const wchar_t value) { switch (value) { @@ -61,64 +61,70 @@ static bool is_separator(const wchar_t value) { namespace llvm { namespace sys { -namespace path { +namespace windows { -// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the -// path is longer than CreateDirectory can tolerate, make it absolute and -// prefixed by '\\?\'. -std::error_code widenPath(const Twine &Path8, - SmallVectorImpl &Path16) { - const size_t MaxDirLen = MAX_PATH - 12; // Must leave room for 8.3 filename. +// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the path +// is longer than the limit that the Win32 Unicode File API can tolerate, make +// it an absolute normalized path prefixed by '\\?\'. +std::error_code widenPath(const Twine &Path8, SmallVectorImpl &Path16, + size_t MaxPathLen) { + assert(MaxPathLen <= MAX_PATH); - // Several operations would convert Path8 to SmallString; more efficient to - // do it once up front. - SmallString<128> Path8Str; + // Several operations would convert Path8 to SmallString; more efficient to do + // it once up front. + SmallString Path8Str; Path8.toVector(Path8Str); - // If we made this path absolute, how much longer would it get? + std::error_code EC = UTF8ToUTF16(Path8Str, Path16); + if (EC) + return EC; + + const bool IsAbsolute = llvm::sys::path::is_absolute(Path8); size_t CurPathLen; - if (llvm::sys::path::is_absolute(Twine(Path8Str))) + if (IsAbsolute) CurPathLen = 0; // No contribution from current_path needed. 
else { - CurPathLen = ::GetCurrentDirectoryW(0, NULL); + CurPathLen = ::GetCurrentDirectoryW( + 0, NULL); // Returns the size including the null terminator. if (CurPathLen == 0) return mapWindowsError(::GetLastError()); } - // Would the absolute path be longer than our limit? - if ((Path8Str.size() + CurPathLen) >= MaxDirLen && - !Path8Str.startswith("\\\\?\\")) { - SmallString<2*MAX_PATH> FullPath("\\\\?\\"); - if (CurPathLen) { - SmallString<80> CurPath; - if (std::error_code EC = llvm::sys::fs::current_path(CurPath)) - return EC; - FullPath.append(CurPath); - } - // Traverse the requested path, canonicalizing . and .. (because the \\?\ - // prefix is documented to treat them as real components). Ignore - // separators, which can be returned from the iterator if the path has a - // drive name. We don't need to call native() on the result since append() - // always attaches preferred_separator. - for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(Path8Str), - E = llvm::sys::path::end(Path8Str); - I != E; ++I) { - if (I->size() == 1 && is_separator((*I)[0])) - continue; - if (I->size() == 1 && *I == ".") - continue; - if (I->size() == 2 && *I == "..") - llvm::sys::path::remove_filename(FullPath); - else - llvm::sys::path::append(FullPath, *I); - } - return UTF8ToUTF16(FullPath, Path16); + const char *const LongPathPrefix = "\\\\?\\"; + + if ((Path16.size() + CurPathLen) < MaxPathLen || + Path8Str.startswith(LongPathPrefix)) + return std::error_code(); + + if (!IsAbsolute) { + if ((EC = llvm::sys::fs::make_absolute(Path8Str))) + return EC; } - // Just use the caller's original path. - return UTF8ToUTF16(Path8Str, Path16); + // Remove '.' and '..' because long paths treat these as real path components. + llvm::sys::path::remove_dots(Path8Str, true); + + const StringRef RootName = llvm::sys::path::root_name(Path8Str); + assert(!RootName.empty() && + "Root name cannot be empty for an absolute path!"); + + // llvm::sys::path::remove_dots, used above, can leave a '/' after the root + // name, and long paths must use '\' as the separator. + const size_t RootNameSize = RootName.size(); + if (RootNameSize < Path8Str.size() && Path8Str[RootNameSize] == '/') + Path8Str[RootNameSize] = '\\'; + + SmallString<2 * MAX_PATH> FullPath(LongPathPrefix); + if (RootName[1] != ':') { // Check if UNC. + FullPath.append("UNC\\"); + FullPath.append(Path8Str.begin() + 2, Path8Str.end()); + } else + FullPath.append(Path8Str); + + return UTF8ToUTF16(FullPath, Path16); } -} // end namespace path + +} // end namespace windows namespace fs { @@ -227,7 +233,9 @@ std::error_code create_directory(const Twine &path, bool IgnoreExisting, perms Perms) { SmallVector<wchar_t, 128> path_utf16; - if (std::error_code ec = widenPath(path, path_utf16)) + // CreateDirectoryW has a lower maximum path length as it must leave room for + // an 8.3 filename. + if (std::error_code ec = widenPath(path, path_utf16, MAX_PATH - 12)) return ec; if (!::CreateDirectoryW(path_utf16.begin(), NULL)) { diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index f20538e40cc0ff..48954ba047356c 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -151,7 +151,7 @@ static HANDLE RedirectIO(Optional<StringRef> Path, int fd, if (windows::UTF8ToUTF16(fname, fnameUnicode)) return INVALID_HANDLE_VALUE; } else { - if (path::widenPath(fname, fnameUnicode)) + if (sys::windows::widenPath(fname, fnameUnicode)) return INVALID_HANDLE_VALUE; } h = CreateFileW(fnameUnicode.data(), fd ? 
GENERIC_WRITE : GENERIC_READ, @@ -263,7 +263,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, fflush(stderr); SmallVector ProgramUtf16; - if (std::error_code ec = path::widenPath(Program, ProgramUtf16)) { + if (std::error_code ec = sys::windows::widenPath(Program, ProgramUtf16)) { SetLastError(ec.value()); MakeErrMsg(ErrMsg, std::string("Unable to convert application name to UTF-16")); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 050d9830e402d8..4a0b1b4d8502bb 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -412,6 +412,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( } LLVM_FALLTHROUGH; case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); break; default: @@ -430,6 +431,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( DstReg != MI.getOperand(DOPIdx).getReg() || MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg(); break; + case AArch64::DestructiveBinaryImm: + DOPRegIsUnique = true; + break; } assert (DOPRegIsUnique && "The destructive operand should be unique"); @@ -498,6 +502,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); switch (DType) { + case AArch64::DestructiveBinaryImm: case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b189dac4abab5..ab9176a8e77a31 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12995,9 +12995,9 @@ static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } -/// Combines a node carrying the intrinsic `aarch64_sve_gather_prf` into a -/// node that uses `aarch64_sve_gather_prf_scaled_uxtw` when the scalar -/// offset passed to `aarch64_sve_gather_prf` is not a valid immediate for +/// Combines a node carrying the intrinsic `aarch64_sve_prf_gather` into a +/// node that uses `aarch64_sve_prf_gather_scaled_uxtw` when the scalar +/// offset passed to `aarch64_sve_prf_gather` is not a valid immediate for /// the sve gather prefetch instruction with vector plus immediate addressing /// mode. static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, @@ -13011,8 +13011,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...otherwise swap the offset base with the offset... SmallVector Ops(N->op_begin(), N->op_end()); std::swap(Ops[ImmPos], Ops[OffsetPos]); - // ...and remap the intrinsic `aarch64_sve_gather_prf` to - // `aarch64_sve_gather_prf_scaled_uxtw`. + // ...and remap the intrinsic `aarch64_sve_prf_gather` to + // `aarch64_sve_prf_gather_scaled_uxtw`. 
SDLoc DL(N); Ops[1] = DAG.getConstant(NewIID, DL, MVT::i64); @@ -13083,30 +13083,30 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::aarch64_sve_gather_prfb: + case Intrinsic::aarch64_sve_prfb_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfb_gather_scaled_uxtw, 1 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfh: + case Intrinsic::aarch64_sve_prfh_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfh_gather_scaled_uxtw, 2 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfw: + case Intrinsic::aarch64_sve_prfw_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfw_gather_scaled_uxtw, 4 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfd: + case Intrinsic::aarch64_sve_prfd_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfd_gather_scaled_uxtw, 8 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfb_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfh_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfw_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfd_scaled_sxtw: + case Intrinsic::aarch64_sve_prfb_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfb_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfh_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfh_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfw_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfw_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfd_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfd_gather_scaled_sxtw: return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d3a541d0246b57..7395f24f2118df 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -880,37 +880,37 @@ multiclass sve_prefetch; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_scaled_sxtw, int_aarch64_sve_prfb_gather_scaled_uxtw>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_scaled_sxtw, int_aarch64_sve_prfh_gather_scaled_uxtw>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", 
ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_scaled_sxtw, int_aarch64_sve_prfw_gather_scaled_uxtw>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_scaled_sxtw, int_aarch64_sve_prfd_gather_scaled_uxtw>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_scaled_sxtw, int_aarch64_sve_prfb_gather_scaled_uxtw>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_scaled_sxtw, int_aarch64_sve_prfh_gather_scaled_uxtw>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_scaled_sxtw, int_aarch64_sve_prfw_gather_scaled_uxtw>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_scaled_sxtw, int_aarch64_sve_prfd_gather_scaled_uxtw>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_gather_prfb_scaled>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_gather_prfh_scaled>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_gather_prfw_scaled>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_gather_prfd_scaled>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_scaled>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_scaled>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_scaled>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_scaled>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
// prfh pldl1keep, p0, [z0.s, #16] // prfh pldl1keep, p0, [z0.d, #16] - defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>; - defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>; - defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>; - defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>; + defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather>; + defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather>; + defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather>; + defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather>; - defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>; - defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>; - defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>; - defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>; + defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather>; + defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather>; + defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather>; + defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather>; defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">; defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">; @@ -1131,17 +1131,22 @@ multiclass sve_prefetch; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + defm ASR_ZPZZ : sve_int_bin_pred_zx; + defm LSR_ZPZZ : sve_int_bin_pred_zx; + defm LSL_ZPZZ : sve_int_bin_pred_zx; + defm ASRD_ZPZI : sve_int_bin_pred_shift_0_right_zx; + + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ", 1>; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ", 1>; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ", 1>; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", 0>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", 0>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", 0>; defm ASR_WIDE_ZPmZ : 
sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; @@ -1777,10 +1782,10 @@ let Predicates = [HasSVE2] in { defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>; // SVE2 predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", int_aarch64_sve_srshr>; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", int_aarch64_sve_urshr>; + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", int_aarch64_sve_sqshlu>; // SVE2 integer add/subtract long diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index a5676d286ebe0e..62ae04a0780811 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -309,6 +309,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports default outlining behaviour. setSupportsDefaultOutlining(true); + + // AArch64 supports the debug entry values. + setSupportsDebugEntryValues(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3937d6390c4da1..6b4924b8f2259c 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -375,6 +375,12 @@ class SVE_3_Op_Pat_SelZero : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), (inst $Op1, $Op2, $Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3)>; } // @@ -433,6 +439,13 @@ let hasNoSchedulingInfo = 1 in { Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> { let FalseLanes = flags; } + + class PredTwoOpImmPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { + let FalseLanes = flags; + } } //===----------------------------------------------------------------------===// @@ -4692,19 +4705,23 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = DestructiveOther; + let DestructiveInstType = DestructiveBinaryImm; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { +multiclass sve_int_bin_pred_shift_imm_left opc, string asm, string psName=""> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : 
sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } @@ -4730,16 +4747,20 @@ multiclass sve2_int_bin_pred_shift_imm_left opc, string asm, def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_imm_right opc, string asm, +multiclass sve_int_bin_pred_shift_imm_right opc, string asm, string Ps, SDPatternOperator op = null_frag> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } @@ -4750,6 +4771,18 @@ multiclass sve_int_bin_pred_shift_imm_right opc, string asm, def : SVE_3_Op_Imm_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_shift_0_right_zx { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; +} + class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, string asm, ZPRRegOp zprty, ZPRRegOp zprty2> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm), @@ -4774,19 +4807,36 @@ class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift opc, string asm, - SDPatternOperator op> { - def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; - def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; - def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; - def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; - +multiclass sve_int_bin_pred_shift opc, string asm, string Ps, + SDPatternOperator op, string revname, bit isOrig> { + let DestructiveInstType = DestructiveBinaryCommWithRev in { + def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; def : SVE_3_Op_Pat(NAME # _S)>; def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_zx { + def _ZERO_B : PredTwoOpPseudo; + def _ZERO_H : 
PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + multiclass sve_int_bin_pred_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index d3135593bf1f5f..112934fd133e1b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -159,13 +159,14 @@ struct AMDGPUFunctionArgInfo { class AMDGPUArgumentUsageInfo : public ImmutablePass { private: - static const AMDGPUFunctionArgInfo ExternFunctionInfo; - static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; DenseMap ArgInfoMap; public: static char ID; + static const AMDGPUFunctionArgInfo ExternFunctionInfo; + static const AMDGPUFunctionArgInfo FixedABIFunctionInfo; + AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 733db1eae80d21..63f7590217dfcb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -601,6 +601,15 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( return std::max(NumVGPR, NumAGPR); } +static const Function *getCalleeFunction(const MachineOperand &Op) { + if (Op.isImm()) { + assert(Op.getImm() == 0); + return nullptr; + } + + return cast(Op.getGlobal()); +} + AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineFunction &MF) const { SIFunctionResourceInfo Info; @@ -853,8 +862,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( const MachineOperand *CalleeOp = TII->getNamedOperand(MI, AMDGPU::OpName::callee); - const Function *Callee = cast(CalleeOp->getGlobal()); - if (Callee->isDeclaration()) { + + const Function *Callee = getCalleeFunction(*CalleeOp); + if (!Callee || Callee->isDeclaration()) { // If this is a call to an external function, we can't do much. Make // conservative guesses. 
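The getCalleeFunction change above is what lets analyzeResourceUsage survive indirect calls: an immediate 0 callee operand now maps to a null Function and conservative estimates instead of an invalid cast. A standalone sketch of the same pattern, with stand-in types in place of the real MachineOperand API (the names and the 256-VGPR ceiling here are illustrative assumptions, not the backend's values):

// Standalone sketch (stand-in types, not the real LLVM classes): resolve a
// callee operand that is either a direct function symbol or the immediate 0
// placeholder for indirect calls, and fall back to worst-case estimates.
#include <cassert>
#include <iostream>
#include <string>
#include <variant>

struct Function { std::string Name; int NumVGPR; };

// A call's callee operand: either a known function or the immediate 0 that
// the backend now emits for indirect calls.
using CalleeOperand = std::variant<const Function *, long>;

const Function *getCalleeFunction(const CalleeOperand &Op) {
  if (const long *Imm = std::get_if<long>(&Op)) {
    assert(*Imm == 0 && "only the 0 placeholder is expected");
    return nullptr; // Indirect call: callee unknown.
  }
  return std::get<const Function *>(Op);
}

int estimateVGPRs(const CalleeOperand &Op) {
  const Function *Callee = getCalleeFunction(Op);
  if (!Callee)
    return 256; // Unknown callee: assume a conservative maximum.
  return Callee->NumVGPR;
}

int main() {
  Function F{"known_fn", 24};
  std::cout << estimateVGPRs(CalleeOperand{&F}) << '\n'; // 24
  std::cout << estimateVGPRs(CalleeOperand{0L}) << '\n'; // 256
}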
@@ -897,7 +907,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasRecursion |= I->second.HasRecursion; } - if (!Callee->doesNotRecurse()) + // FIXME: Call site could have norecurse on it + if (!Callee || !Callee->doesNotRecurse()) Info.HasRecursion = true; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 3a402d11bb77f4..7f6971a32ca914 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3470,6 +3470,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { + B.setInstr(MI); + const int NumDefs = MI.getNumExplicitDefs(); bool IsTFE = NumDefs == 2; // We are only processing the operands of d16 image operations on subtargets @@ -3479,18 +3481,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); - Observer.changingInstr(MI); - auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); - - - unsigned NewOpcode = NumDefs == 0 ? - AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; - - // Track that we legalized this - MI.setDesc(B.getTII().get(NewOpcode)); - - B.setInstr(MI); - MachineRegisterInfo *MRI = B.getMRI(); const LLT S32 = LLT::scalar(32); const LLT S16 = LLT::scalar(16); @@ -3506,6 +3496,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( int NumVAddrs, NumGradients; std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); + const int DMaskIdx = BaseOpcode->Atomic ? -1 : + getDMaskIdx(BaseOpcode, NumDefs); + unsigned DMask = 0; + + int DMaskLanes = 0; + if (!BaseOpcode->Atomic) { + DMask = MI.getOperand(DMaskIdx).getImm(); + if (BaseOpcode->Gather4) { + DMaskLanes = 4; + } else if (DMask != 0) { + DMaskLanes = countPopulation(DMask); + } else if (!IsTFE && !BaseOpcode->Store) { + // If dmask is 0, this is a no-op load. This can be eliminated. + B.buildUndef(MI.getOperand(0)); + MI.eraseFromParent(); + return true; + } + } + + Observer.changingInstr(MI); + auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); + + unsigned NewOpcode = NumDefs == 0 ? + AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + + // Track that we legalized this + MI.setDesc(B.getTII().get(NewOpcode)); + + // Expecting to get an error flag since TFC is on - and dmask is 0 Force + // dmask to be at least 1 otherwise the instruction will fail + if (IsTFE && DMask == 0) { + DMask = 0x1; + DMaskLanes = 1; + MI.getOperand(DMaskIdx).setImm(DMask); + } // If the register allocator cannot place the address registers contiguously // without introducing moves, then using the non-sequential address encoding @@ -3556,13 +3581,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); } - int DMaskLanes = 0; - if (!BaseOpcode->Atomic) { - const int DMaskIdx = getDMaskIdx(BaseOpcode, NumDefs); - unsigned DMask = MI.getOperand(DMaskIdx).getImm(); - DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); - } - if (BaseOpcode->Store) { // No TFE for stores? 
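The dmask bookkeeping moved above the opcode rewrite determines how many result lanes an image operation really produces. A rough standalone model of that classification (plain C++ with illustrative flags only, not the MIMG opcode tables; the TFE adjustment is folded into the same function for brevity):

#include <bitset>
#include <iostream>

struct DMaskInfo { unsigned Lanes; bool NoOpLoad; };

DMaskInfo classifyDMask(unsigned DMask, bool Gather4, bool IsTFE, bool Store) {
  if (Gather4)
    return {4, false};            // gather4 always produces 4 lanes
  if (DMask != 0)
    return {(unsigned)std::bitset<4>(DMask).count(), false};
  if (!IsTFE && !Store)
    return {0, true};             // dmask == 0: the load is a no-op
  // With TFE the instruction must still write the error flag, so the
  // legalizer forces at least one data lane (dmask = 0x1).
  return {1, false};
}

int main() {
  std::cout << classifyDMask(0b1011, false, false, false).Lanes << '\n'; // 3
  std::cout << classifyDMask(0, false, false, false).NoOpLoad << '\n';   // 1
  std::cout << classifyDMask(0, false, true, false).Lanes << '\n';       // 1
}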
// TODO: Handle dmask trim Register VData = MI.getOperand(1).getReg(); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 893fa43fcca8ee..e814c47a8f792f 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -184,11 +184,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addMemOperand(MMO); } -void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const { +// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` +void SIFrameLowering::emitEntryFunctionFlatScratchInit( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo(); // We don't need this if we only have spills since there is no user facing @@ -201,11 +203,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. - // Debug location must be unknown since the first debug location is used to - // determine the end of the prologue. - DebugLoc DL; - MachineBasicBlock::iterator I = MBB.begin(); - Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); @@ -216,8 +213,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - // Do a 64-bit pointer add. if (ST.flatScratchIsPointer()) { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { @@ -266,16 +261,20 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, .addImm(8); } -unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { +// Shift down registers reserved for the scratch RSRC. +Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( + MachineFunction &MF) const { + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + + assert(MFI->isEntryFunction()); + + Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - // We need to insert initialization of the scratch resource descriptor. - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); if (ScratchRsrcReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; @@ -315,29 +314,35 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( } // Shift down registers reserved for the scratch wave offset. 
-std::pair -SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const { +Register SIFrameLowering::getEntryFunctionReservedScratchWaveOffsetReg( + MachineFunction &MF) const { + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + SIMachineFunctionInfo *MFI = MF.getInfo(); assert(MFI->isEntryFunction()); - // No replacement necessary. + Register ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { - return std::make_pair(AMDGPU::NoRegister, false); + (!MRI.isPhysRegUsed(ScratchWaveOffsetReg) && !hasFP(MF) && + !MFI->hasFlatScratchInit())) { + assert(!hasFP(MF) && !MFI->hasFlatScratchInit()); + return AMDGPU::NoRegister; } - if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, false); + if (ST.hasSGPRInitBug() || + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) + return ScratchWaveOffsetReg; unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, false); + return ScratchWaveOffsetReg; AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -358,90 +363,78 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( unsigned ReservedRegCount = 13; if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, false); - - bool HandledScratchWaveOffsetReg = - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - bool FPAdjusted = false; + return ScratchWaveOffsetReg; for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - if (!HandledScratchWaveOffsetReg) { - HandledScratchWaveOffsetReg = true; - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { - assert(!hasFP(MF)); - MFI->setStackPtrOffsetReg(Reg); - } - - MFI->setScratchWaveOffsetReg(Reg); - MFI->setFrameOffsetReg(Reg); - ScratchWaveOffsetReg = Reg; - FPAdjusted = true; - break; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { + assert(!hasFP(MF)); + MFI->setStackPtrOffsetReg(Reg); } + MFI->setScratchWaveOffsetReg(Reg); + MFI->setFrameOffsetReg(Reg); + return Reg; } } - return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); + return ScratchWaveOffsetReg; } void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - SIMachineFunctionInfo *MFI = MF.getInfo(); + // FIXME: If we only have SGPR spills, we won't actually be using scratch + // memory since these spill to VGPRs. We should be cleaning up these unused + // SGPR spill frame indices somewhere. - // If we only have SGPR spills, we won't actually be using scratch memory - // since these spill to VGPRs. - // - // FIXME: We should be cleaning up these unused SGPR spill frame indices - // somewhere. 
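The scan in getEntryFunctionReservedScratchWaveOffsetReg picks a replacement wave offset register: skip the preloaded SGPRs, keep a tail of registers free for the scratch descriptor, and take the first register that is unused and allocatable. A simplified sketch with stand-in data rather than MachineRegisterInfo (returning no value here models "keep the register we already have"):

#include <iostream>
#include <optional>
#include <vector>

struct Reg { int Id; bool Used; bool Allocatable; };

std::optional<int> pickScratchWaveOffsetReg(const std::vector<Reg> &Pool,
                                            unsigned NumPreloaded,
                                            unsigned ReservedTail) {
  if (NumPreloaded + ReservedTail > Pool.size())
    return std::nullopt; // Pool too small: keep the current register.
  // First candidate after the preloaded SGPRs; stop before the reserved tail.
  for (size_t I = NumPreloaded; I + ReservedTail < Pool.size(); ++I)
    if (!Pool[I].Used && Pool[I].Allocatable)
      return Pool[I].Id;
  return std::nullopt;
}

int main() {
  std::vector<Reg> Pool{{0, true, true},  {1, true, true},
                        {2, false, false}, {3, false, true},
                        {4, false, true},  {5, false, true}};
  if (auto R = pickScratchWaveOffsetReg(Pool, /*NumPreloaded=*/2,
                                        /*ReservedTail=*/2))
    std::cout << "picked s" << *R << '\n'; // picked s3
}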
+ // FIXME: We still have implicit uses on SGPR spill instructions in case they + // need to spill to vector memory. It's likely that will not happen, but at + // this point it appears we need the setup. This part of the prolog should be + // emitted after frame indices are eliminated. + + // FIXME: Remove all of the isPhysRegUsed checks + SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); + assert(MFI->isEntryFunction()); + // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. + // + // These calls will return `AMDGPU::NoRegister` in cases where there are no + // actual uses of the respective registers. + Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF); + Register ScratchWaveOffsetReg = + getEntryFunctionReservedScratchWaveOffsetReg(MF); - // FIXME: We still have implicit uses on SGPR spill instructions in case they - // need to spill to vector memory. It's likely that will not happen, but at - // this point it appears we need the setup. This part of the prolog should be - // emitted after frame indices are eliminated. - - if (MFI->hasFlatScratchInit()) - emitFlatScratchInit(ST, MF, MBB); + // Make the selected registers live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) + OtherBB.addLiveIn(ScratchWaveOffsetReg); - unsigned ScratchWaveOffsetReg; - bool FPAdjusted; - std::tie(ScratchWaveOffsetReg, FPAdjusted) = - getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + if (ScratchRsrcReg != AMDGPU::NoRegister) + OtherBB.addLiveIn(ScratchRsrcReg); + } - // We need to insert initialization of the scratch resource descriptor. + // Now that we have fixed the reserved registers we need to locate the + // (potentially) preloaded registers. We should always have a preloaded + // scratch wave offset register, but we only have a preloaded scratch rsrc + // register for HSA. Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdHsaOrMesa(F)) { - PreloadedPrivateBufferReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - } - - bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchWaveOffsetReg); - bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchRsrcReg); - // FIXME: Hack to not crash in situations which emitted an error. if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) return; @@ -451,72 +444,63 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); - MRI.addLiveIn(PreloadedPrivateBufferReg); - MBB.addLiveIn(PreloadedPrivateBufferReg); - } - - // Make the register selected live throughout the function. 
- for (MachineBasicBlock &OtherBB : MF) { - if (&OtherBB == &MBB) - continue; - - if (OffsetRegUsed || FPAdjusted) - OtherBB.addLiveIn(ScratchWaveOffsetReg); - - if (ResourceRegUsed) - OtherBB.addLiveIn(ScratchRsrcReg); + Register PreloadedScratchRsrcReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOrMesa(F)) { + PreloadedScratchRsrcReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + if (ScratchRsrcReg != AMDGPU::NoRegister && + PreloadedScratchRsrcReg != AMDGPU::NoRegister) { + MRI.addLiveIn(PreloadedScratchRsrcReg); + MBB.addLiveIn(PreloadedScratchRsrcReg); + } } DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - // If we reserved the original input registers, we don't need to copy to the - // reserved registers. + const bool HasFP = hasFP(MF); - bool CopyBuffer = ResourceRegUsed && - PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdHsaOrMesa(F) && - ScratchRsrcReg != PreloadedPrivateBufferReg; + // If we are not HSA or we happened to reserve the original input registers, + // we don't need to copy to the reserved registers. + const bool CopyBuffer = ST.isAmdHsaOrMesa(F) && + ScratchRsrcReg != AMDGPU::NoRegister && + PreloadedScratchRsrcReg != AMDGPU::NoRegister && + ScratchRsrcReg != PreloadedScratchRsrcReg; // This needs to be careful of the copying order to avoid overwriting one of // the input registers before it's been copied to its final // destination. Usually the offset should be copied first. - bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg, - ScratchWaveOffsetReg); + const bool CopyBufferFirst = + TRI->isSubRegisterEq(PreloadedScratchRsrcReg, ScratchWaveOffsetReg); + if (CopyBuffer && CopyBufferFirst) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + .addReg(PreloadedScratchRsrcReg, RegState::Kill); } - unsigned SPReg = MFI->getStackPtrOffsetReg(); - assert(SPReg != AMDGPU::SP_REG); - - // FIXME: Remove the isPhysRegUsed checks - const bool HasFP = hasFP(MF); - - if (HasFP || OffsetRegUsed) { - assert(ScratchWaveOffsetReg); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); + .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); } if (CopyBuffer && !CopyBufferFirst) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + .addReg(PreloadedScratchRsrcReg, RegState::Kill); } - if (ResourceRegUsed) { - emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, - PreloadedPrivateBufferReg, ScratchRsrcReg); + // FIXME: This should also implement the setup path for HSA. + if (ScratchRsrcReg != AMDGPU::NoRegister) { + emitEntryFunctionScratchRsrcRegSetup( + MF, MBB, I, DL, PreloadedScratchRsrcReg, ScratchRsrcReg); } if (HasFP) { - DebugLoc DL; const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); int64_t StackSize = FrameInfo.getStackSize(); + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + // On kernel entry, the private scratch wave offset is the SP value.
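The CopyBufferFirst test above exists because the two prologue copies can overlap: if the offset copy's destination aliases the preloaded rsrc register, copying the offset first would clobber the buffer's source. A minimal sketch of that ordering decision, with plain integers standing in for registers (no sub-register aliasing, which the real isSubRegisterEq check also covers):

#include <iostream>
#include <vector>

struct Copy { int Dst, Src; };

// Emit two register copies so that neither copy overwrites a source the
// other copy still needs.
std::vector<Copy> orderProlog(Copy BufferCopy, Copy OffsetCopy) {
  bool CopyBufferFirst = (BufferCopy.Src == OffsetCopy.Dst);
  if (CopyBufferFirst)
    return {BufferCopy, OffsetCopy};
  return {OffsetCopy, BufferCopy}; // usually the offset is copied first
}

int main() {
  // The buffer lives in r4 and must move to r0; the offset moves r5 -> r4.
  // Copying the offset first would clobber r4 before the buffer is saved.
  for (const Copy &C : orderProlog({0, 4}, {4, 5}))
    std::cout << "r" << C.Dst << " <- r" << C.Src << '\n';
}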
if (StackSize == 0) { BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) @@ -527,18 +511,24 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addImm(StackSize * ST.getWavefrontSize()); } } + + if (MFI->hasFlatScratchInit()) { + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, + MFI->getScratchWaveOffsetReg()); + } } -// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. -void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, - MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const { +// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister` +void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); const Function &Fn = MF.getFunction(); - DebugLoc DL; if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either @@ -607,11 +597,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, .addReg(Rsrc03 ) .addImm(0xffdfffff); } - - return; - } - if (ST.isMesaGfxShader(Fn) - || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { + } else if (ST.isMesaGfxShader(Fn) || + (PreloadedScratchRsrcReg == AMDGPU::NoRegister)) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 8dd09726f6d2dc..46525e9b1fbbe9 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -55,26 +55,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering { MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const; - - unsigned getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const; - - std::pair getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - - // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
- void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, - MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; + + Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const; + + Register + getEntryFunctionReservedScratchWaveOffsetReg(MachineFunction &MF) const; + + void emitEntryFunctionScratchRsrcRegSetup(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register PreloadedPrivateBufferReg, + Register ScratchRsrcReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5562e118b4c8d0..b9e825fcfc4491 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2446,21 +2446,20 @@ void SITargetLowering::passSpecialInputs( if (!CLI.CS) return; - const Function *CalleeFunc = CLI.CS.getCalledFunction(); - assert(CalleeFunc); - SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - - auto &ArgUsageInfo = - DAG.getPass()->getAnalysis(); - const AMDGPUFunctionArgInfo &CalleeArgInfo - = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); - const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + const AMDGPUFunctionArgInfo *CalleeArgInfo + = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; + if (const Function *CalleeFunc = CLI.CS.getCalledFunction()) { + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis(); + CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + } + // TODO: Unify with private memory register handling. This is complicated by // the fact that at least in kernels, the input argument is not necessarily // in the same location as the input. @@ -2478,7 +2477,7 @@ void SITargetLowering::passSpecialInputs( const ArgDescriptor *OutgoingArg; const TargetRegisterClass *ArgRC; - std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); + std::tie(OutgoingArg, ArgRC) = CalleeArgInfo->getPreloadedValue(InputID); if (!OutgoingArg) continue; @@ -2519,13 +2518,13 @@ void SITargetLowering::passSpecialInputs( const TargetRegisterClass *ArgRC; std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); if (!OutgoingArg) std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); if (!OutgoingArg) std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); if (!OutgoingArg) return; @@ -2540,10 +2539,10 @@ void SITargetLowering::passSpecialInputs( SDLoc SL; // If incoming ids are not packed we need to pack them. 
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX) + if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); - if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) { + if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) { SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, DAG.getShiftAmountConstant(10, MVT::i32, SL)); @@ -2551,7 +2550,7 @@ void SITargetLowering::passSpecialInputs( DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y; } - if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) { + if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) { SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, DAG.getShiftAmountConstant(20, MVT::i32, SL)); @@ -2709,7 +2708,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!CLI.CS.getInstruction()) report_fatal_error("unsupported libcall legalization"); - if (!CLI.CS.getCalledFunction()) { + if (!AMDGPUTargetMachine::EnableFixedFunctionABI && !CLI.CS.getCalledFunction()) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); } @@ -2938,9 +2937,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(Callee); // Add a redundant copy of the callee global which will not be legalized, as // we need direct access to the callee later. - GlobalAddressSDNode *GSD = cast(Callee); - const GlobalValue *GV = GSD->getGlobal(); - Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + if (GlobalAddressSDNode *GSD = dyn_cast(Callee)) { + const GlobalValue *GV = GSD->getGlobal(); + Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + } else { + Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); + } if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 20a028e555572c..efa17770dd5865 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -472,8 +472,8 @@ def SI_RETURN : SPseudoInstSI < // Return for returning function calls without output register. // -// This version is only needed so we can fill in the output regiter in -// the custom inserter. +// This version is only needed so we can fill in the output register +// in the custom inserter. def SI_CALL_ISEL : SPseudoInstSI < (outs), (ins SSrc_b64:$src0, unknown:$callee), [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { @@ -485,6 +485,11 @@ def SI_CALL_ISEL : SPseudoInstSI < let isConvergent = 1; } +def : GCNPat< + (AMDGPUcall i64:$src0, (i64 0)), + (SI_CALL_ISEL $src0, (i64 0)) +>; + // Wrapper around s_swappc_b64 with extra $callee parameter to track // the called function after regalloc. def SI_CALL : SPseudoInstSI < diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 45a88a686dab6b..2e6bcb550999dd 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -17079,7 +17079,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::arm_mve_vld4q: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. 
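The SHL/OR chain in passSpecialInputs above builds the packed workitem-ID layout: X, Y and Z each occupy ten bits at offsets 0, 10 and 20 of one 32-bit value. A small self-contained model of that packing (the 10-bit field width is read off the shift amounts above; the assert is an assumption of this sketch, not a backend check):

#include <cassert>
#include <cstdint>
#include <iostream>

uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  assert(X < 1024 && Y < 1024 && Z < 1024 && "each ID is 10 bits");
  return X | (Y << 10) | (Z << 20);
}

uint32_t unpackID(uint32_t Packed, unsigned Component /*0=X,1=Y,2=Z*/) {
  return (Packed >> (10 * Component)) & 0x3FF;
}

int main() {
  uint32_t P = packWorkItemIDs(7, 3, 1);
  std::cout << std::hex << P << '\n';  // 100c07
  std::cout << unpackID(P, 1) << '\n'; // 3
}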
- Type *VecTy = cast(I.getType())->getTypeAtIndex(1); + Type *VecTy = cast(I.getType())->getElementType(1); unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4; Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); Info.ptrVal = I.getArgOperand(0); diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index d51807501d0dd2..58164e57ab818a 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4210,11 +4210,32 @@ class MVE_VQxDMLxDH { + def "": MVE_VQxDMLxDH; + defvar Inst = !cast(NAME); + defvar ConstParams = (? (i32 exch), (i32 round), (i32 subtract)); + defvar unpred_intr = int_arm_mve_vqdmlad; + defvar pred_intr = int_arm_mve_vqdmlad_predicated; + + def : Pat<(VTI.Vec !con((unpred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams)), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)))>; + def : Pat<(VTI.Vec !con((pred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams, + (? (VTI.Pred VCCR:$pred)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; +} + multiclass MVE_VQxDMLxDH_multi { - def s8 : MVE_VQxDMLxDH; - def s16 : MVE_VQxDMLxDH; - def s32 : MVE_VQxDMLxDH; + defm s8 : MVE_VQxDMLxDH_p; + defm s16 : MVE_VQxDMLxDH_p; + defm s32 : MVE_VQxDMLxDH_p; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 84876eda33a6f5..63aa65267ef266 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -243,6 +243,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, this->Options.NoTrapAfterNoreturn = true; } + // ARM supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2d718011059ab4..e9470907c289e9 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -15368,22 +15368,48 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; + return isFMAFasterThanFMulAndFAdd( + MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext())); +} - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - case MVT::f64: +bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, + Type *Ty) const { + switch (Ty->getScalarType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: return true; - case MVT::f128: - return (EnableQuadPrecision && Subtarget.hasP9Vector()); + case Type::FP128TyID: + return EnableQuadPrecision && Subtarget.hasP9Vector(); default: - break; + return false; } +} - return false; +// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist. +// FIXME: add more patterns which are profitable to hoist. 
+bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (!I->hasOneUse()) + return true; + + Instruction *User = I->user_back(); + assert(User && "A single use instruction with no uses."); + + if (User->getOpcode() != Instruction::FSub && + User->getOpcode() != Instruction::FAdd) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + const Function *F = I->getFunction(); + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *Ty = User->getOperand(0)->getType(); + + return !( + isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } const MCPhysReg * diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index c0a0f9a79a3abf..70bf4fbfce1d18 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -907,6 +907,14 @@ namespace llvm { bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override; + + /// isProfitableToHoist - Check if it is profitable to hoist instruction + /// \p I to its dominator block. + /// For example, it is not profitable if \p I and its only user can form an + /// FMA instruction, because PowerPC prefers FMADD. + bool isProfitableToHoist(Instruction *I) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; // Should we expand the build vector with shuffles? diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 1b3c52b4edf0c5..f7a68dd162297e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2879,7 +2879,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec) ? 64 : 32, SExtImm, true); InVal = InVal.rotl(SH); - uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; + uint64_t Mask = MB == 0 ? -1LLU : (1LLU << (63 - MB + 1)) - 1; InVal &= Mask; // Can't replace negative values with an LI as that will sign-extend // and not clear the left bits. If we're setting the CR bit, we will use diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp index bd3b95a98b9f78..9db3107da0733f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp @@ -22,5 +22,5 @@ using namespace llvm; -RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) - : RISCVGenRegisterBankInfo() {} +RISCVRegisterBankInfo::RISCVRegisterBankInfo(unsigned HwMode) + : RISCVGenRegisterBankInfo(HwMode) {} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h index 05fac992734d99..71dddd28380dec 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h @@ -31,7 +31,7 @@ class RISCVGenRegisterBankInfo : public RegisterBankInfo { /// This class provides the information for the target register banks.
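The PPCInstrInfo::convertToImmediateForm change above is an undefined-behavior guard: for MB == 0 the old mask expression shifted a 64-bit one left by 64, which C++ leaves undefined. A minimal reproduction of the guarded computation (63 - MB + 1 is written as 64 - MB here; the printed values are easy to verify by hand):

#include <cstdint>
#include <iostream>

uint64_t lowBitsMask(unsigned MB) {
  // Mask with the low (64 - MB) bits set. When MB == 0 the shift amount
  // would be 64, so return the all-ones mask directly instead of shifting.
  return MB == 0 ? ~0ULL : (1ULL << (64 - MB)) - 1;
}

int main() {
  std::cout << std::hex << lowBitsMask(32) << '\n'; // ffffffff
  std::cout << std::hex << lowBitsMask(0) << '\n';  // ffffffffffffffff
}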
class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo { public: - RISCVRegisterBankInfo(const TargetRegisterInfo &TRI); + RISCVRegisterBankInfo(unsigned HwMode); }; } // end namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 47a48c820a290c..9815a785268906 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -56,7 +56,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering())); Legalizer.reset(new RISCVLegalizerInfo(*this)); - auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo()); + auto *RBI = new RISCVRegisterBankInfo(getHwMode()); RegBankInfo.reset(RBI); InstSelector.reset(createRISCVInstructionSelector( *static_cast(&TM), *this, *RBI)); diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 22b4e2805a5ea3..0cfa7bb04771ef 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -232,6 +232,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, setMachineOutliner(true); + // x86 supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index a03c75bb7dda2d..8ae5749cd06cbf 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2780,7 +2780,6 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, // FIXME: These assume a naive kshift+binop lowering, which is probably // conservative in most cases. - // FIXME: This doesn't cost large types like v128i1 correctly. static const CostTblEntry AVX512BoolReduction[] = { { ISD::AND, MVT::v2i1, 3 }, { ISD::AND, MVT::v4i1, 5 }, @@ -2827,18 +2826,28 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, // Handle bool allof/anyof patterns. if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) { + unsigned ArithmeticCost = 0; + if (MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. 
+ Type *SingleOpTy = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy); + ArithmeticCost *= LT.first - 1; + } + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; } return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 3be31bcd172be5..d718574a81c8ca 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -295,7 +295,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, if (auto *ElPTy = dyn_cast(ElTy)) ElTy = ElPTy->getElementType(); else - ElTy = cast(ElTy)->getTypeAtIndex(II); + ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II); } // And create a GEP to extract those indices. V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx"); @@ -784,7 +784,7 @@ bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) { if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) return false; - if (!isa(type)) + if (!isa(type) && !isa(type)) return true; // For homogenous sequential types, check for padding within members. diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ff746ad0cbb3a2..3b234ca0be7d36 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -142,7 +142,7 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) { E = STy->element_end(); I != E; ++I) { Type *InnerTy = *I; if (isa(InnerTy)) return true; - if (isa(InnerTy)) + if (isa(InnerTy) || isa(InnerTy)) Types.push_back(InnerTy); } break; diff --git a/llvm/lib/Transforms/IPO/StripSymbols.cpp b/llvm/lib/Transforms/IPO/StripSymbols.cpp index 6ce00714523b30..088091df770f9a 100644 --- a/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -147,10 +147,12 @@ static void RemoveDeadConstant(Constant *C) { if (GlobalVariable *GV = dyn_cast(C)) { if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals. GV->eraseFromParent(); - } - else if (!isa(C)) - if (isa(C->getType())) + } else if (!isa(C)) { + // FIXME: Why does the type of the constant matter here? + if (isa(C->getType()) || isa(C->getType()) || + isa(C->getType())) C->destroyConstant(); + } // If the constant referenced anything, see if we can delete it as well. 
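The getArithmeticReductionCost change above accounts for type splitting: when the boolean mask vector is wider than the widest legal type, the value is split into LT.first parts, and combining them costs LT.first - 1 arithmetic ops on top of the table entry for the legal type. An illustrative cost sketch (the costs are made-up numbers, not the X86 table values):

#include <iostream>

int reductionCost(unsigned NumElts, unsigned LegalElts, int ArithCost,
                  int LegalReductionCost) {
  if (NumElts <= LegalElts)
    return LegalReductionCost;
  unsigned NumParts = NumElts / LegalElts; // LT.first in the real code
  return (NumParts - 1) * ArithCost + LegalReductionCost;
}

int main() {
  // A v64i1 allof on a target whose widest mask type is v16i1 splits into 4
  // parts: 3 ANDs to combine them plus one legal-type reduction.
  std::cout << reductionCost(64, 16, /*ArithCost=*/1,
                             /*LegalReductionCost=*/9) << '\n'; // 12
}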
for (Constant *O : Operands) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2138017606b799..38c7a95f90dd27 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -296,78 +296,116 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { bool LogicalShift = false; bool ShiftLeft = false; + bool IsImm = false; switch (II.getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psrai_d: case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psra_q_128: case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psra_q_256: case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: case Intrinsic::x86_avx512_psrai_d_512: case Intrinsic::x86_avx512_psrai_q_512: case Intrinsic::x86_avx512_psrai_w_512: - LogicalShift = false; ShiftLeft = false; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: case Intrinsic::x86_avx512_psrli_d_512: case Intrinsic::x86_avx512_psrli_q_512: case Intrinsic::x86_avx512_psrli_w_512: - LogicalShift = true; ShiftLeft = false; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; break; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: case Intrinsic::x86_avx512_pslli_d_512: case Intrinsic::x86_avx512_pslli_q_512: case Intrinsic::x86_avx512_pslli_w_512: - LogicalShift = true; ShiftLeft = 
true; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; break; } assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - // Simplify if count is constant. - auto Arg1 = II.getArgOperand(1); - auto CAZ = dyn_cast(Arg1); - auto CDV = dyn_cast(Arg1); - auto CInt = dyn_cast(Arg1); - if (!CAZ && !CDV && !CInt) + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine to + // zero and arithmetic shifts are clamped to (BitWidth - 1). + if (IsImm) { + assert(Amt->getType()->isIntegerTy(32) && + "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } + + // Simplify if count is constant vector. + auto CAZ = dyn_cast(Amt); + auto CDV = dyn_cast(Amt); + if (!CAZ && !CDV) return nullptr; APInt Count(64, 0); @@ -387,14 +425,6 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, Count |= SubElt->getValue().zextOrTrunc(64); } } - else if (CInt) - Count = CInt->getValue(); - - auto Vec = II.getArgOperand(0); - auto VT = cast(Vec->getType()); - auto SVT = VT->getElementType(); - unsigned VWidth = VT->getNumElements(); - unsigned BitWidth = SVT->getPrimitiveSizeInBits(); // If shift-by-zero then just return the original value. if (Count.isNullValue()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 0b1d9e8df03925..afdddad10cea22 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2421,10 +2421,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. // This can enhance SROA and other transforms that want type-safe pointers. 
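The known-bits fold in simplifyX86immShift above is sound because x86 vector shifts have defined out-of-range behavior, while generic IR shifts do not; the intrinsic may only be rewritten as a plain IR shift when the amount is provably in range, and folds to zero (logical) or a clamped sign splat (arithmetic) when provably out of range. A small model of the hardware semantics being matched (assuming arithmetic right shift for signed values, which C++20 guarantees):

#include <cstdint>
#include <iostream>

int32_t x86AShr(int32_t V, uint64_t Amt) {
  if (Amt >= 32) Amt = 31; // arithmetic shifts clamp to BitWidth - 1
  return V >> Amt;
}

uint32_t x86LShr(uint32_t V, uint64_t Amt) {
  if (Amt >= 32) return 0; // logical shifts flush to zero
  return V >> Amt;
}

int main() {
  std::cout << x86AShr(-64, 40) << '\n'; // -1 (sign splat)
  std::cout << x86LShr(64u, 40) << '\n'; // 0
  // Only when the amount is known to be < 32 does a generic IR shift give
  // the same result, which is what the computeKnownBits check establishes.
  std::cout << x86AShr(-64, 2) << '\n';  // -16
}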
unsigned NumZeros = 0; - while (SrcElTy != DstElTy && - isa(SrcElTy) && !SrcElTy->isPointerTy() && - SrcElTy->getNumContainedTypes() /* not "{}" */) { - SrcElTy = cast(SrcElTy)->getTypeAtIndex(0U); + while (SrcElTy && SrcElTy != DstElTy) { + SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0); ++NumZeros; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4db482646ab2dc..b95f3f6a0ecdcb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -348,7 +348,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should // allocate and return a unique pointer, even for a zero byte allocation. - if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) { + if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) { // For a zero sized alloca there is no point in doing an array allocation. // This is helpful if the array size is a complicated expression not used // elsewhere. @@ -365,7 +365,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // dominance as the array size was forced to a constant earlier already. AllocaInst *EntryAI = dyn_cast(FirstInst); if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || - DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + DL.getTypeAllocSize(EntryAI->getAllocatedType()) + .getKnownMinSize() != 0) { AI.moveBefore(FirstInst); return &AI; } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index e0063eb24515fa..9d17e92eca203a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1961,10 +1961,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (J > 0) { if (J == 1) { CurTy = Op1->getSourceElementType(); - } else if (auto *CT = dyn_cast(CurTy)) { - CurTy = CT->getTypeAtIndex(Op1->getOperand(J)); } else { - CurTy = nullptr; + CurTy = + GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J)); } } } diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 5efd3ffc2680c8..27ddb28aaa461e 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -181,8 +181,8 @@ class LowerMatrixIntrinsics { void setColumn(unsigned i, Value *V) { Columns[i] = V; } - size_t getNumColumns() const { return Columns.size(); } - size_t getNumRows() const { + unsigned getNumColumns() const { return Columns.size(); } + unsigned getNumRows() const { assert(Columns.size() > 0 && "Cannot call getNumRows without columns"); return cast(Columns[0]->getType())->getNumElements(); } @@ -634,10 +634,11 @@ class LowerMatrixIntrinsics { return true; } - void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, - ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - auto VType = cast(Inst->getType()); + /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between + /// columns. 
+ ColumnMatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, + ShapeInfo Shape, IRBuilder<> &Builder) { + auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); ColumnMatrixTy Result; // Distance between start of one column and the start of the next @@ -648,10 +649,41 @@ class LowerMatrixIntrinsics { Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); Result.addColumn(Column); } + return Result.addNumLoads(getNumOps(Result.getColumnTy()) * + Result.getNumColumns()); + } + + /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix, + /// starting at \p MatrixPtr[I][J]. + ColumnMatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, + unsigned J, ShapeInfo ResultShape, Type *EltTy, + IRBuilder<> &Builder) { + + Value *Offset = Builder.CreateAdd( + Builder.CreateMul(Builder.getInt32(J), + Builder.getInt32(MatrixShape.NumRows)), + Builder.getInt32(I)); + + unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); + Value *EltPtr = + Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); + Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); + Type *TileTy = + VectorType::get(EltTy, ResultShape.NumRows * ResultShape.NumColumns); + Type *TilePtrTy = PointerType::get(TileTy, AS); + Value *TilePtr = + Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); + + return loadMatrix(TileTy, TilePtr, Builder.getInt32(ResultShape.NumRows), + ResultShape, Builder); + } + /// Lower a load instruction with shape information. + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); finalizeLowering(Inst, - Result.addNumLoads(getNumOps(Result.getColumnTy()) * - Result.getNumColumns()), + loadMatrix(Inst->getType(), Ptr, Stride, Shape, Builder), Builder); } @@ -665,22 +697,54 @@ class LowerMatrixIntrinsics { {Inst->getArgOperand(2), Inst->getArgOperand(3)}); } - void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, - ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - auto VType = cast(Matrix->getType()); + /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p + /// MatrixPtr[I][J]. + void storeMatrix(const ColumnMatrixTy &StoreVal, Value *MatrixPtr, + ShapeInfo MatrixShape, unsigned I, unsigned J, Type *EltTy, + IRBuilder<> &Builder) { + Value *Offset = Builder.CreateAdd( + Builder.CreateMul(Builder.getInt32(J), + Builder.getInt32(MatrixShape.NumRows)), + Builder.getInt32(I)); + + unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); + Value *EltPtr = + Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); + Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); + Type *TileTy = VectorType::get(EltTy, StoreVal.getNumRows() * + StoreVal.getNumColumns()); + Type *TilePtrTy = PointerType::get(TileTy, AS); + Value *TilePtr = + Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); + + storeMatrix(TileTy, StoreVal, TilePtr, + Builder.getInt32(StoreVal.getNumRows()), Builder); + } + + /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between + /// columns. 
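The tiled loadMatrix/storeMatrix overloads above both address a sub-matrix inside a column-major R x C matrix the same way: element (I, J) sits at linear offset J * R + I, and adjacent tile columns are R elements apart (the stride). A self-contained sketch of that addressing with a plain vector in place of IR pointers:

#include <iostream>
#include <vector>

std::vector<double> loadTile(const std::vector<double> &M, unsigned R,
                             unsigned I, unsigned J, unsigned TileRows,
                             unsigned TileCols) {
  std::vector<double> Tile;
  unsigned Start = J * R + I;                  // offset of the tile's (0,0)
  for (unsigned C = 0; C < TileCols; ++C)      // one column at a time,
    for (unsigned Rw = 0; Rw < TileRows; ++Rw) // stride R between columns
      Tile.push_back(M[Start + C * R + Rw]);
  return Tile;
}

int main() {
  // 4x4 column-major matrix holding 0..15; grab the 2x2 tile at (1,1).
  std::vector<double> M(16);
  for (unsigned K = 0; K < 16; ++K)
    M[K] = K;
  for (double V : loadTile(M, /*R=*/4, /*I=*/1, /*J=*/1, 2, 2))
    std::cout << V << ' '; // 5 6 9 10
  std::cout << '\n';
}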
+ ColumnMatrixTy storeMatrix(Type *Ty, ColumnMatrixTy StoreVal, Value *Ptr, + Value *Stride, IRBuilder<> &Builder) { + auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - auto LM = getMatrix(Matrix, Shape, Builder); - for (auto C : enumerate(LM.columns())) { - Value *GEP = - computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride, - Shape.NumRows, VType->getElementType(), Builder); + for (auto C : enumerate(StoreVal.columns())) { + Value *GEP = computeColumnAddr(EltPtr, Builder.getInt32(C.index()), + Stride, StoreVal.getNumRows(), + VType->getElementType(), Builder); createColumnStore(C.value(), GEP, VType->getElementType(), Builder); } - Inst2ColumnMatrix[Inst] = ColumnMatrixTy().addNumStores( - getNumOps(LM.getColumnTy()) * LM.getNumColumns()); + return ColumnMatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * + StoreVal.getNumColumns()); + } - ToRemove.push_back(Inst); + /// Lower a store instruction with shape information. + void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto StoreVal = getMatrix(Matrix, Shape, Builder); + finalizeLowering( + Inst, storeMatrix(Matrix->getType(), StoreVal, Ptr, Stride, Builder), + Builder); } /// Lowers llvm.matrix.columnwise.store. diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index c17c35b7e32321..fcd7ed195291a2 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -739,9 +739,8 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (PN.getType()->isStructTy()) return (void)markOverdefined(&PN); - if (isOverdefined(getValueState(&PN))) { - return (void)markOverdefined(&PN); - } + if (getValueState(&PN).isOverdefined()) + return; // Quick exit // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. @@ -753,38 +752,19 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains unknown. - Constant *OperandVal = nullptr; + bool Changed = false; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); - if (IV.isUnknownOrUndef()) continue; // Doesn't influence PHI node. - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) continue; - if (isOverdefined(IV)) // PHI node becomes overdefined! - return (void)markOverdefined(&PN); - - if (!OperandVal) { // Grab the first value. - OperandVal = getConstant(IV); - continue; - } - - // There is already a reachable operand. If we conflict with it, - // then the PHI node becomes overdefined. If we agree with it, we - // can continue on. - - // Check to see if there are two different constants merging, if so, the PHI - // node is overdefined. - if (getConstant(IV) != OperandVal) - return (void)markOverdefined(&PN); + LatticeVal &Res = getValueState(&PN); + Changed |= Res.mergeIn(IV, DL); + if (Res.isOverdefined()) + break; } - - // If we exited the loop, this means that the PHI node only has constant - // arguments that agree with each other(and OperandVal is the constant) or - // OperandVal is null because there are no defined incoming arguments. If - // this is the case, the PHI remains unknown. 
- if (OperandVal) - markConstant(&PN, OperandVal); // Acquire operand value + if (Changed) + pushToWorkListMsg(ValueState[&PN], &PN); } void SCCPSolver::visitReturnInst(ReturnInst &I) { @@ -977,9 +957,18 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { LatticeVal V2State = getValueState(I.getOperand(1)); LatticeVal &IV = ValueState[&I]; - if (isOverdefined(IV)) + if (IV.isOverdefined()) + return; + + // If something is undef, wait for it to resolve. + if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) + return; + + if (V1State.isOverdefined() && V2State.isOverdefined()) return (void)markOverdefined(&I); + // Both operands are non-integer constants or constant expressions. + // TODO: Use information from notconstant better. if (isConstant(V1State) && isConstant(V2State)) { Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V1State), getConstant(V2State)); @@ -989,50 +978,21 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { return (void)markConstant(IV, &I, C); } - // If something is undef, wait for it to resolve. - if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) - return; + // Operands are either constant ranges, notconstant, overdefined, or one of + // the operands is a constant. + ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits()); + ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits()); + if (V1State.isConstantRange()) + A = V1State.getConstantRange(); + if (V2State.isConstantRange()) + B = V2State.getConstantRange(); - // Otherwise, one of our operands is overdefined. Try to produce something - // better than overdefined with some tricks. - // If this is 0 / Y, it doesn't matter that the second operand is - // overdefined, and we can replace it with zero. - if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv) - if (isConstant(V1State) && getConstant(V1State)->isNullValue()) - return (void)markConstant(IV, &I, getConstant(V1State)); - - // If this is: - // -> AND/MUL with 0 - // -> OR with -1 - // it doesn't matter that the other operand is overdefined. - if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul || - I.getOpcode() == Instruction::Or) { - LatticeVal *NonOverdefVal = nullptr; - if (!isOverdefined(V1State)) - NonOverdefVal = &V1State; - - else if (!isOverdefined(V2State)) - NonOverdefVal = &V2State; - if (NonOverdefVal) { - if (!isConstant(*NonOverdefVal)) - return; - - if (I.getOpcode() == Instruction::And || - I.getOpcode() == Instruction::Mul) { - // X and 0 = 0 - // X * 0 = 0 - if (getConstant(*NonOverdefVal)->isNullValue()) - return (void)markConstant(IV, &I, getConstant(*NonOverdefVal)); - } else { - // X or -1 = -1 - if (ConstantInt *CI = getConstantInt(*NonOverdefVal)) - if (CI->isMinusOne()) - return (void)markConstant(IV, &I, CI); - } - } - } + ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B); + mergeInValue(&I, LatticeVal::getRange(R)); - markOverdefined(&I); + // TODO: Currently we do not exploit special values that produce something + // better than overdefined with an overdefined operand for vector or floating + // point types, like 'and <4 x i32> overdefined, zeroinitializer'. } // Handle ICmpInst instruction.
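The rewritten visitBinaryOperator above drops the hand-rolled special cases ('0 / Y', 'X & 0', 'X | -1') in favor of generic range arithmetic: both operand states are widened to ConstantRanges and ConstantRange::binaryOp computes the result range. A minimal standalone sketch of that idea (ConstantRange and binaryOp are the real LLVM APIs the patch calls; the driver below is illustrative only and assumes LLVM headers and libraries are available to build against):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  using namespace llvm;
  // A singleton range {0} and a fully overdefined operand (the full range).
  ConstantRange Zero(APInt(32, 0));
  ConstantRange Any = ConstantRange::getFull(32);
  // Range arithmetic subsumes the removed "X * 0 == 0" trick: multiplying
  // anything by the singleton {0} yields {0} without a dedicated code path.
  ConstantRange R = Zero.binaryOp(Instruction::Mul, Any);
  R.print(errs()); // prints [0,1), i.e. exactly {0}
  errs() << "\n";
  return 0;
}

The same call covers the other opcodes the old code enumerated by hand, falling back to a conservative full-range answer where no tighter bound is known.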
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e411c4ece83d9d..377aa78730b047 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3131,7 +3131,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; - while (isa(EltTy)) { + while (isa(EltTy) || isa(EltTy)) { if (auto *ST = dyn_cast<StructType>(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 45be91baec8d92..28463943bfa3e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -404,8 +404,10 @@ void VPInstruction::print(raw_ostream &O) const { } void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const { - printAsOperand(O, SlotTracker); - O << " = "; + if (hasResult()) { + printAsOperand(O, SlotTracker); + O << " = "; + } switch (getOpcode()) { case VPInstruction::Not: @@ -578,19 +580,10 @@ void VPlanPrinter::dump() { OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; if (!Plan.getName().empty()) OS << "\\n" << DOT::EscapeString(Plan.getName()); - if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { - OS << ", where:"; - if (Plan.BackedgeTakenCount) { - OS << "\\n"; - Plan.BackedgeTakenCount->print(OS, SlotTracker); - OS << " := BackedgeTakenCount"; - } - for (auto Entry : Plan.Value2VPValue) { - OS << "\\n"; - Entry.second->print(OS, SlotTracker); - OS << DOT::EscapeString(" := "); - Entry.first->printAsOperand(OS, false); - } + if (Plan.BackedgeTakenCount) { + OS << ", where:\\n"; + Plan.BackedgeTakenCount->print(OS, SlotTracker); + OS << " := BackedgeTakenCount"; } OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; @@ -814,11 +807,18 @@ void VPValue::replaceAllUsesWith(VPValue *New) { } void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { + if (const Value *UV = getUnderlyingValue()) { + OS << "ir<"; + UV->printAsOperand(OS, false); + OS << ">"; + return; + } + unsigned Slot = Tracker.getSlot(this); if (Slot == unsigned(-1)) OS << "<badref>"; else - OS << "%vp" << Tracker.getSlot(this); + OS << "vp<%" << Tracker.getSlot(this) << ">"; } void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, @@ -869,6 +869,13 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, void VPSlotTracker::assignSlot(const VPValue *V) { assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!"); + const Value *UV = V->getUnderlyingValue(); + if (UV) + return; + const auto *VPI = dyn_cast<VPInstruction>(V); + if (VPI && !VPI->hasResult()) + return; + Slots[V] = NextSlot++; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 3cd4464c1efe51..044d2ffc3fc3e1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -735,6 +735,26 @@ class VPInstruction : public VPUser, public VPRecipeBase { return Opcode == Instruction::Store || Opcode == Instruction::Call || Opcode == Instruction::Invoke || Opcode == SLPStore; } + + bool hasResult() const { + // CallInst may or may not have a result, depending on the called function. + // Conservatively assume calls have results for now.
+ switch (getOpcode()) { + case Instruction::Ret: + case Instruction::Br: + case Instruction::Store: + case Instruction::Switch: + case Instruction::IndirectBr: + case Instruction::Resume: + case Instruction::CatchRet: + case Instruction::Unreachable: + case Instruction::Fence: + case Instruction::AtomicRMW: + return false; + default: + return true; + } + } }; /// VPWidenRecipe is a recipe for producing a copy of vector type for each @@ -1468,7 +1488,7 @@ class VPlan { void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - Value2VPValue[V] = new VPValue(); + Value2VPValue[V] = new VPValue(V); } VPValue *getVPValue(Value *V) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 9004650fac84fa..abaf4dc43015cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -42,6 +42,7 @@ class VPValue { friend class VPlanTransforms; friend class VPBasicBlock; friend class VPInterleavedAccessInfo; + friend class VPSlotTracker; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -64,6 +65,7 @@ class VPValue { /// Return the underlying Value attached to this VPValue. Value *getUnderlyingValue() { return UnderlyingVal; } + const Value *getUnderlyingValue() const { return UnderlyingVal; } // Set \p Val as the underlying Value of this VPValue. void setUnderlyingValue(Value *Val) { diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll index 455f7326f6b791..9a558118e4cea4 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -270,9 +270,9 @@ define i32 @reduce_i1(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' @@ -282,8 +282,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 
@llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' @@ -293,8 +293,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' @@ -303,9 +303,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' @@ -316,7 +316,7 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' @@ -325,9 +325,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll index ee05562dc241e4..011a6cc1d051f2 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -270,9 +270,9 @@ define i32 @reduce_i1(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' @@ -282,8 +282,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' @@ -293,8 +293,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' @@ -303,9 +303,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 
9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' @@ -316,7 +316,7 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' @@ -325,9 +325,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = 
call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/MemorySSA/scalable-vec.ll b/llvm/test/Analysis/MemorySSA/scalable-vec.ll new file mode 100644 index 00000000000000..23072876a2d72c --- /dev/null +++ b/llvm/test/Analysis/MemorySSA/scalable-vec.ll @@ -0,0 +1,25 @@ +; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze < %s 2>&1 | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>' -verify-memoryssa -disable-output < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: define <vscale x 4 x i32> @f( +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK: MemoryUse(1) MustAlias +define <vscale x 4 x i32> @f(<vscale x 4 x i32> %z) { + %a = alloca <vscale x 4 x i32> + store <vscale x 4 x i32> %z, <vscale x 4 x i32>* %a + %zz = load <vscale x 4 x i32>, <vscale x 4 x i32>* %a + ret <vscale x 4 x i32> %zz +} + +; CHECK-LABEL: define i32 @g( +; CHECK: 1 = MemoryDef(liveOnEntry) +; CHECK: MemoryUse(1) MayAlias +declare i32* @gg(<vscale x 4 x i32>* %a) +define i32 @g(i32 %z, i32 *%bb) { + %a = alloca <vscale x 4 x i32> + %aa = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %a, i32 0, i32 0 + store i32 %z, i32* %aa + %bbb = call i32* @gg(<vscale x 4 x i32>* %a) readnone + %zz = load i32, i32* %bbb + ret i32 %zz +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 661bd0d121e0bf..418d09d01fd33b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -167,16 +167,6 @@ define void @nonpow2_load_narrowing() { ret void } -; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:gpr64(s64), %{{[0-9]+}}:gpr(s1) = G_UADDE %{{[0-9]+}}:gpr, %{{[0-9]+}}:gpr, %{{[0-9]+}}:gpr (in function: nonpow2_store_narrowing) -; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing -; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing: -define void @nonpow2_store_narrowing(i96* %c) { - %a = add i128 undef, undef - %b = trunc i128 %a to i96 - store i96 %b, i96* %c - ret void -} - ; Currently can't handle vector lengths that aren't an exact multiple of ; natively supported vector lengths. Test that the fall-back works for those. ; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %1:_(<7 x s64>) = G_ADD %0, %0 (in function: nonpow2_vector_add_fewerelements diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir new file mode 100644 index 00000000000000..2c5f12ac9d8dd7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +name: uitofp_to_zero +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: uitofp_to_zero + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK: $s0 = COPY [[C]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_UITOFP %0(s32) + $s0 = COPY %1(s32) + RET_ReallyLR implicit $s0 + +... +--- +name: sitofp_to_zero +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sitofp_to_zero + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; CHECK: $s0 = COPY [[C]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_SITOFP %0(s32) + $s0 = COPY %1(s32) + RET_ReallyLR implicit $s0 + +...
+--- +name: and_to_zero +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: and_to_zero + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_AND %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: mul_to_zero +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: mul_to_zero + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_MUL %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: or_to_negative_one +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: or_to_negative_one + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: $w0 = COPY [[C]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_OR %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: xor_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: xor_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_XOR %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: add_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: add_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_ADD %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: sub_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sub_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_CONSTANT i32 10 + %1:_(s32) = G_IMPLICIT_DEF + %2:_(s32) = G_SUB %0, %1 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: fptoui_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: fptoui_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_FPTOUI %0(s32) + $w0 = COPY %1(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: fptosi_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: fptosi_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(s32) = G_IMPLICIT_DEF + %1:_(s32) = G_FPTOSI %0(s32) + $w0 = COPY %1(s32) + RET_ReallyLR implicit $w0 + +... 
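The prelegalizercombiner-undef.mir tests above pin down how the pre-legalizer combiner folds G_* operations with a G_IMPLICIT_DEF operand: and/mul become zero, or becomes all-ones, and xor/add/sub (like the fp-to-int casts) can simply stay undef. A small sketch of that folding table in plain C++ (the Op enum and foldWithUndef helper are invented for illustration; this is not the GlobalISel combiner API):

#include <cstdint>
#include <iostream>
#include <optional>

// The integer G_* opcodes exercised by the tests, reduced to plain enumerators.
enum class Op { And, Mul, Or, Xor, Add, Sub };

// Returns the constant "x <op> undef" may be folded to, or std::nullopt when
// the result can itself be replaced by undef (any value is a legal outcome,
// so no constant needs to be materialized).
std::optional<int64_t> foldWithUndef(Op O) {
  switch (O) {
  case Op::And:
  case Op::Mul:
    return 0;   // undef may be 0, so x & undef and x * undef can always be 0
  case Op::Or:
    return -1;  // undef may be all ones, so x | undef can always be all ones
  case Op::Xor:
  case Op::Add:
  case Op::Sub:
    return std::nullopt; // any value is reachable; the result stays undef
  }
  return std::nullopt;
}

int main() {
  std::cout << *foldWithUndef(Op::Or) << "\n"; // -1, matching or_to_negative_one
  return 0;
}

The int-to-fp tests follow the same pattern with +0.0 as the folded constant, while fptoui/fptosi of undef remain undef.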
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir new file mode 100644 index 00000000000000..08aa043f5c6839 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir @@ -0,0 +1,149 @@ +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix TWO-RUNS +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix ONE-RUN +# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=4 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix FOUR-RUNS + +# Example of Repeated Instruction Sequence - Iterative Machine Outlining +# +#; define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { +# ... ... ... +# %8 = load i1, i1* %7 %8 = load i1, i1* %7 +# %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 +# store i4 %9, i4* %5 store i4 %9, i4* %5 store i4 %9, i4* %5 +# ... ... ... +# } } } +# +# After machine outliner (1st time) +# +# define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { +# ... ... ... +# %8 = load i1, i1* %7 %8 = load i1, i1* %7 +# call void @outlined_function_1_1 call void @outlined_function_1_1 call void @outlined_function_1_1 +# ... ... ... +# } } } +# +# After machine outliner (2nd time) +# +# define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { +# ... ... ... +# call void @outlined_function_2_1 call void @outlined_function_1_1 call void @outlined_function_2_1 +# ... ... ... +# } } } +# +# Check whether machine outliner can further find the outlining opportunity after machine +# outlining has performed. +# +--- | + declare void @foo() local_unnamed_addr + + declare void @widget() local_unnamed_addr + + ; Function Attrs: minsize noredzone optsize + define void @baz.14() #0 { + ret void + } + + ; Function Attrs: minsize noredzone optsize + define void @baz.15() #0 { + ret void + } + + ; Function Attrs: minsize noredzone optsize + define void @baz.16() #0 { + ret void + } + + attributes #0 = { minsize noredzone optsize } +... +--- +name: baz.14 +tracksRegLiveness: true +stack: + - { id: 0, offset: -8, size: 8 } + - { id: 1, offset: -16, size: 8 } +body: | + bb.0: + liveins: $x0, $x19, $lr + + early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0) + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $w19, -8 + frame-setup CFI_INSTRUCTION offset $w30, -16 + renamable $x19 = COPY $x0 + renamable $x0 = nuw ADDXri $x0, 48, 0 + $x1 = ADDXri $sp, 0, 0 + dead $w2 = MOVi32imm 33, implicit-def $x2 + $x3 = COPY $xzr + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit killed $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp + $x0 = COPY killed renamable $x19 + BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0) + RET_ReallyLR + +... 
+--- +name: baz.15 +tracksRegLiveness: true +stack: + - { id: 0, offset: -8, size: 8 } + - { id: 1, offset: -16, size: 8 } +body: | + bb.0: + liveins: $x0, $x19, $lr + + early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0) + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $w19, -8 + frame-setup CFI_INSTRUCTION offset $w30, -16 + renamable $x19 = COPY $x0 + renamable $x0 = nuw ADDXri killed renamable $x0, 16, 0 + $x1 = ADDXri $sp, 0, 0 + dead $w2 = MOVi32imm 33, implicit-def $x2 + $x3 = COPY $xzr + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp + $x0 = COPY killed renamable $x19 + BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0) + RET_ReallyLR + +... +--- +name: baz.16 +tracksRegLiveness: true +stack: + - { id: 0, offset: -8, size: 8 } + - { id: 1, offset: -16, size: 8 } +body: | + bb.0: + liveins: $x0, $x19, $lr + + early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0) + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + frame-setup CFI_INSTRUCTION offset $w19, -8 + frame-setup CFI_INSTRUCTION offset $w30, -16 + renamable $x19 = COPY $x0 + renamable $x0 = nuw ADDXri $x0, 48, 0 + $x1 = ADDXri $sp, 0, 0 + dead $w2 = MOVi32imm 33, implicit-def $x2 + $x3 = COPY $xzr + BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit killed $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp + $x0 = COPY killed renamable $x19 + BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0) + RET_ReallyLR + +... + +# TWO-RUNS: name: OUTLINED_FUNCTION_2_0 +# TWO-RUNS-DAG: bb.0: +# TWO-RUNS-DAG: renamable $x19 = COPY $x0 +# TWO-RUNS-NEXT: renamable $x0 = nuw ADDXri $x0, 48, 0 +# TWO-RUNS-NEXT: TCRETURNdi @OUTLINED_FUNCTION_0, 0, implicit $sp +# +# The machine outliner is expected to stop at the 1st iteration for case ONE-RUN +# since machine-outline-runs is specified as 1. +# ONE-RUN-NOT: [[OUTLINED:OUTLINED_FUNCTION_2_[0-9]+]] +# +# The machine outliner is expected to stop at the 3rd iteration for case FOUR-RUNS +# since the MIR has no change at the 3rd iteration. 
+# FOUR-RUNS-NOT: [[OUTLINED:OUTLINED_FUNCTION_3_[0-9]+]] +# FOUR-RUNS-NOT: [[OUTLINED:OUTLINED_FUNCTION_4_[0-9]+]] diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll index 78251707a0105b..2de5668d1b9ea0 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll @@ -1,200 +1,200 @@ ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s ; PRFB , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, sxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFB , , [, .D, ] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, sxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFB , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d] ; CHECK-NEXT: ret - call void 
@llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFH , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, sxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFH , , [, .D, #1] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, sxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFH , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, lsl #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFW , , [, .S, ] -> 32-bit scaled offset -define void 
@llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, sxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFW , , [, .D, #2] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, sxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFW , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfw_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, lsl #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFD , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfd 
pldl1strm, p0, [x0, z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, sxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFD , , [, .D, #3] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, sxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFD , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfd_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, lsl #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) - ret void - } - -declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, 
%offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) + call void @llvm.aarch64.sve.prfd.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +declare void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll index 481302ce597209..8be10be55f278c 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
@@ -1,82 +1,82 @@
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

 ; PRFB <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfb_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32:
+define void @llvm_aarch64_sve_prfb_gather_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32:
 ; CHECK-NEXT: prfb pldl1strm, p0, [z0.s, #7]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 7, i32 1)
+  call void @llvm.aarch64.sve.prfb.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 7, i32 1)
   ret void
 }

 ; PRFB <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfb_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64:
+define void @llvm_aarch64_sve_prfb_gather_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64:
 ; CHECK-NEXT: prfb pldl1strm, p0, [z0.d, #7]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 7, i32 1)
+  call void @llvm.aarch64.sve.prfb.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 7, i32 1)
   ret void
 }

 ; PRFH <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfh_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32:
+define void @llvm_aarch64_sve_prfh_gather_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32:
 ; CHECK-NEXT: prfh pldl1strm, p0, [z0.s, #6]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 6, i32 1)
+  call void @llvm.aarch64.sve.prfh.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 6, i32 1)
   ret void
 }

 ; PRFH <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfh_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64:
+define void @llvm_aarch64_sve_prfh_gather_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64:
 ; CHECK-NEXT: prfh pldl1strm, p0, [z0.d, #6]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 6, i32 1)
+  call void @llvm.aarch64.sve.prfh.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 6, i32 1)
   ret void
 }

 ; PRFW <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfw_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32:
+define void @llvm_aarch64_sve_prfw_gather_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32:
 ; CHECK-NEXT: prfw pldl1strm, p0, [z0.s, #12]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 12, i32 1)
+  call void @llvm.aarch64.sve.prfw.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 12, i32 1)
   ret void
 }

 ; PRFW <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfw_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64:
+define void @llvm_aarch64_sve_prfw_gather_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64:
 ; CHECK-NEXT: prfw pldl1strm, p0, [z0.d, #12]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 12, i32 1)
+  call void @llvm.aarch64.sve.prfw.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 12, i32 1)
   ret void
 }

 ; PRFD <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfd_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32:
+define void @llvm_aarch64_sve_prfd_gather_nx4vi32(<vscale x 4 x i32> %bases, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32:
 ; CHECK-NEXT: prfd pldl1strm, p0, [z0.s, #16]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 16, i32 1)
+  call void @llvm.aarch64.sve.prfd.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 16, i32 1)
   ret void
 }

 ; PRFD <prfop>, <Pg>, [<Zn>.D{, #<imm>}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfd_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64:
+define void @llvm_aarch64_sve_prfd_gather_nx2vi64(<vscale x 2 x i64> %bases, <vscale x 2 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64:
 ; CHECK-NEXT: prfd pldl1strm, p0, [z0.d, #16]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 16, i32 1)
+  call void @llvm.aarch64.sve.prfd.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 16, i32 1)
   ret void
 }

-declare void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfb.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfh.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfh.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfw.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfw.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfd.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
-declare void @llvm.aarch64.sve.gather.prfd.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfb.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfb.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfh.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfh.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfw.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfw.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfd.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 %prfop)
+declare void @llvm.aarch64.sve.prfd.gather.nx2vi64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> %bases, i64 %imm, i32 %prfop)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
index 4b0b42eb73b98e..ca027edfd5def1 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll
@@ -1,286 +1,286 @@
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s

 ; PRFB <prfop>, <Pg>, [<Zn>.S{, #<imm>}] -> 32-bit element, imm = 0, 1, ..., 31
-define void @llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset:
+define void @llvm_aarch64_sve_prfb_gather_nx4vi32_runtime_offset(<vscale x 4 x i32> %bases, i64 %imm, <vscale x 4 x i1> %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_runtime_offset:
 ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sve.gather.prfb.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32 1)
+  call void @llvm.aarch64.sve.prfb.gather.nx4vi32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> %bases, i64 %imm, i32
1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #32 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 32, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 32, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } ; PRFB , , [.D{, #}] -> 64-bit element, imm = 0, 1, ..., 31 -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #32 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 32, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 32, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFH , , [.S{, #}] -> 32-bit element, imm = 0, 2, 
..., 62 -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #63 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 63, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 63, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFH , , [.D{, #}] -> 64-bit element, imm = 0, 2, ..., 62 -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: 
llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #63 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 63, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 63, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFW , , [.S{, #}] -> 32-bit element, imm = 0, 4, ..., 124 -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; 
CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFW , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 124 -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { -; CHECK-LABEL: 
llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFD , , [.S{, #}] -> 32-bit element, imm = 0, 8, ..., 248 -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFD , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 248 -define void 
@llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } -declare void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void 
@llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll new file mode 100644 index 00000000000000..7f5105da675e1e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll @@ -0,0 +1,340 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ASR +; + +define @asr_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i64( %pg, %a, %b) { +; CHECK-LABEL: asr_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i8: +; CHECK-NOT: movprfx +; CHECK: asr z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i16: +; CHECK-NOT: movprfx +; CHECK: asr z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i32: +; CHECK-NOT: movprfx +; CHECK: asr z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +; +; ASRD +; + +define @asrd_i8( %pg, %a) { +; CHECK-LABEL: asrd_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #1 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv16i8( %pg, + %a_z, + i32 1) + ret %out +} + +define @asrd_i16( %pg, %a) { +; CHECK-LABEL: asrd_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #2 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv8i16( %pg, + %a_z, + i32 2) + ret %out +} + +define @asrd_i32( %pg, %a) { +; CHECK-LABEL: asrd_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #31 +; CHECK-NEXT: ret + 
%a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv4i32( %pg, + %a_z, + i32 31) + ret %out +} + +define @asrd_i64( %pg, %a) { +; CHECK-LABEL: asrd_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #64 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv2i64( %pg, + %a_z, + i32 64) + ret %out +} + +; +; LSL +; + +define @lsl_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i64( %pg, %a, %b) { +; CHECK-LABEL: lsl_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i8: +; CHECK-NOT: movprfx +; CHECK: lsl z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i16: +; CHECK-NOT: movprfx +; CHECK: lsl z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i32: +; CHECK-NOT: movprfx +; CHECK: lsl z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +; +; LSR +; + +define @lsr_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i64( %pg, %a, %b) { +; CHECK-LABEL: lsr_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i8: +; CHECK-NOT: movprfx +; CHECK: lsr z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + 
%out = call @llvm.aarch64.sve.lsr.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i16: +; CHECK-NOT: movprfx +; CHECK: lsr z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i32: +; CHECK-NOT: movprfx +; CHECK: lsr z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.asr.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.nxv4i32(, , ) +declare @llvm.aarch64.sve.asr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.asr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.asrd.nxv16i8(, , i32) +declare @llvm.aarch64.sve.asrd.nxv8i16(, , i32) +declare @llvm.aarch64.sve.asrd.nxv4i32(, , i32) +declare @llvm.aarch64.sve.asrd.nxv2i64(, , i32) + +declare @llvm.aarch64.sve.lsl.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsl.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsl.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.lsr.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv4i32(, , ) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll index fad2fcbd08f9b9..aee2404071937e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll @@ -2903,15 +2903,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 - ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GFX9: $vgpr1 = COPY [[DEF]](s32) - ; GFX9: $vgpr2 = COPY [[DEF]](s32) - ; GFX9: $vgpr3 = COPY [[DEF]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) + ; GFX9: $vgpr0 = COPY [[UV]](s32) + ; GFX9: $vgpr1 = COPY [[UV1]](s32) + ; GFX9: $vgpr2 = COPY [[UV2]](s32) + ; GFX9: $vgpr3 = COPY [[UV3]](s32) ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, 
implicit $vgpr3 ; GFX10NSA-LABEL: name: getresinfo_dmask0 ; GFX10NSA: bb.1.main_body: @@ -2925,15 +2922,12 @@ define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x floa ; GFX10NSA: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 0, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0 - ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GFX10NSA: $vgpr1 = COPY [[DEF]](s32) - ; GFX10NSA: $vgpr2 = COPY [[DEF]](s32) - ; GFX10NSA: $vgpr3 = COPY [[DEF]](s32) + ; GFX10NSA: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) + ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) + ; GFX10NSA: $vgpr1 = COPY [[UV1]](s32) + ; GFX10NSA: $vgpr2 = COPY [[UV2]](s32) + ; GFX10NSA: $vgpr3 = COPY [[UV3]](s32) ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 main_body: %mip = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll index 3063bd4b3bedd7..d9d83883ed0113 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll @@ -508,11 +508,8 @@ define amdgpu_ps half @image_load_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") - ; UNPACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16) - ; UNPACKED: $vgpr0 = COPY [[ANYEXT]](s32) + ; UNPACKED: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED: $vgpr0 = COPY [[DEF]](s32) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: image_load_f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -527,11 +524,8 @@ define amdgpu_ps half @image_load_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), 
[[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8") - ; PACKED: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16) - ; PACKED: $vgpr0 = COPY [[ANYEXT]](s32) + ; PACKED: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; PACKED: $vgpr0 = COPY [[DEF]](s32) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret half %tex @@ -600,18 +594,8 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_0000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) - ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; UNPACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: $vgpr0 = COPY [[DEF]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: image_load_v2f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -626,10 +610,8 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_0000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") - ; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) + ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; PACKED: $vgpr0 = COPY [[DEF]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %tex @@ 
-785,23 +767,10 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_0000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) - ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>) - ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 ; UNPACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[EXTRACT]](<3 x s16>), 0 + ; UNPACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 + ; UNPACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 ; UNPACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 ; UNPACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 ; UNPACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) @@ -820,18 +789,14 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_0000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) - ; PACKED: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; PACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; 
PACKED: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[UV]](<3 x s16>), 0 - ; PACKED: [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 - ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 - ; PACKED: $vgpr0 = COPY [[EXTRACT]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0 + ; PACKED: [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0 + ; PACKED: [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32 + ; PACKED: $vgpr0 = COPY [[EXTRACT1]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[EXTRACT2]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %tex @@ -1023,21 +988,10 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_0000(<8 x i32> inreg %rsrc, ; UNPACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; UNPACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; UNPACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") - ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; UNPACKED: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] - ; UNPACKED: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; UNPACKED: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNPACKED: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) - ; UNPACKED: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; UNPACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; UNPACKED: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C1]], [[SHL]] - ; UNPACKED: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) - ; UNPACKED: $vgpr0 = COPY [[BITCAST]](<2 x s16>) - ; UNPACKED: $vgpr1 = COPY [[BITCAST1]](<2 x s16>) + ; UNPACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; UNPACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; UNPACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; UNPACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 ; PACKED-LABEL: name: image_load_v4f16_dmask_0000 ; PACKED: bb.1 (%ir-block.0): @@ -1052,12 +1006,10 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_0000(<8 x i32> inreg %rsrc, ; PACKED: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; PACKED: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; PACKED: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = 
G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") - ; PACKED: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; PACKED: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>) - ; PACKED: $vgpr1 = COPY [[DEF]](<2 x s16>) + ; PACKED: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; PACKED: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; PACKED: $vgpr0 = COPY [[UV]](<2 x s16>) + ; PACKED: $vgpr1 = COPY [[UV1]](<2 x s16>) ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %tex @@ -1080,7 +1032,7 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1102,7 +1054,7 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32 ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 2 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; PACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1191,7 +1143,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), 
[[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1220,7 +1172,7 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -1405,7 +1357,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; UNPACKED: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV]](s32) @@ -1443,7 +1395,7 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs ; PACKED: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8) ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1681,7 +1633,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rs ; UNPACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; UNPACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; UNPACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; UNPACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; UNPACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; UNPACKED: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; UNPACKED: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 @@ -1713,7 +1665,7 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rs ; PACKED: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; PACKED: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; PACKED: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; PACKED: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; PACKED: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; PACKED: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[UV]](s32) ; PACKED: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF 
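(Note on the d16 hunks above: every changed test loads with a dmask of zero, the first immediate operand of the intrinsic, so no channels are requested. The new checks show the legalizer folding such loads to G_IMPLICIT_DEF instead of emitting G_AMDGPU_INTRIN_IMAGE_LOAD; only the TFE variants keep the instruction, with the dmask operand bumped from 0 to 1 because the status word must still be written. A minimal sketch of the non-TFE input pattern, mirroring the calls already present in these tests; the function name is illustrative, not taken from the patch:)

; dmask (first operand) is 0: no channels are loaded, so the result
; lanes are undefined and the legalizer can drop the load entirely.
define amdgpu_ps <4 x float> @image_load_dmask0_sketch(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
  %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)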
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll index e37860f712e223..e7a8ca40c02a04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll @@ -238,10 +238,8 @@ define amdgpu_ps float @image_load_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 %s, ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8") - ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) + ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GCN: $vgpr0 = COPY [[DEF]](s32) ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret float %tex @@ -286,12 +284,10 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8") - ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN: $vgpr1 = COPY [[DEF]](s32) + ; GCN: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) + ; GCN: $vgpr0 = COPY [[UV]](s32) + ; GCN: $vgpr1 = COPY [[UV1]](s32) ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 %tex = call <2 x float> @llvm.amdgcn.image.load.2d.v2f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %tex @@ -364,13 +360,11 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 12 from custom 
"TargetCustom8", align 16) - ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN: $vgpr1 = COPY [[DEF]](s32) - ; GCN: $vgpr2 = COPY [[DEF]](s32) + ; GCN: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF + ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<3 x s32>) + ; GCN: $vgpr0 = COPY [[UV]](s32) + ; GCN: $vgpr1 = COPY [[UV1]](s32) + ; GCN: $vgpr2 = COPY [[UV2]](s32) ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %tex = call <3 x float> @llvm.amdgcn.image.load.2d.v3f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %tex @@ -473,14 +467,12 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_0000(<8 x i32> inreg %rsrc, ; GCN: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GCN: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") - ; GCN: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GCN: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) - ; GCN: $vgpr1 = COPY [[DEF]](s32) - ; GCN: $vgpr2 = COPY [[DEF]](s32) - ; GCN: $vgpr3 = COPY [[DEF]](s32) + ; GCN: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF + ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<4 x s32>) + ; GCN: $vgpr0 = COPY [[UV]](s32) + ; GCN: $vgpr1 = COPY [[UV1]](s32) + ; GCN: $vgpr2 = COPY [[UV2]](s32) + ; GCN: $vgpr3 = COPY [[UV3]](s32) ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 0, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %tex @@ -503,7 +495,7 @@ define amdgpu_ps float @image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32 ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 4 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: $vgpr0 = COPY [[UV]](s32) @@ -563,7 +555,7 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r ; 
GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 8 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -658,7 +650,7 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 12 from custom "TargetCustom8", align 16) ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -789,7 +781,7 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; GCN: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) - ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 0, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; GCN: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; GCN: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GCN: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GCN: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) diff --git 
a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll new file mode 100644 index 00000000000000..54812462799722 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -0,0 +1,201 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant void()*, align 4 +@gv.fptr1 = external hidden unnamed_addr addrspace(4) constant void(i32)*, align 4 + +define amdgpu_kernel void @test_indirect_call_sgpr_ptr() { +; GCN-LABEL: test_indirect_call_sgpr_ptr: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 7 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 7 +; GCN-NEXT: granulated_wavefront_sgpr_count = 5 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 192 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; GCN-NEXT: user_sgpr_count = 14 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 1 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 1 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: workitem_private_segment_byte_size = 16384 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 0 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 48 +; GCN-NEXT: workitem_vgpr_count = 32 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s33, s17 +; GCN-NEXT: 
s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_u32 s12, s12, s33 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, gv.fptr0@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr0@rel32@hi+4 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm + %fptr = load void()*, void()* addrspace(4)* @gv.fptr0 + call void %fptr() + ret void +} + +define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() { +; GCN-LABEL: test_indirect_call_sgpr_ptr_arg: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 7 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 7 +; GCN-NEXT: granulated_wavefront_sgpr_count = 5 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 192 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; GCN-NEXT: user_sgpr_count = 14 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 1 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 1 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 1 +; GCN-NEXT: workitem_private_segment_byte_size = 16384 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 0 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 48 +; GCN-NEXT: workitem_vgpr_count = 32 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: 
.end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s33, s17 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_u32 s12, s12, s33 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, gv.fptr1@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr1@rel32@hi+4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_endpgm + %fptr = load void(i32)*, void(i32)* addrspace(4)* @gv.fptr1 + call void %fptr(i32 123) + ret void +} + +; FIXME +; define void @test_indirect_call_vgpr_ptr(void()* %fptr) { +; call void %fptr() +; ret void +; } + +; define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) { +; call void %fptr(i32 123) +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index c09d18e104f79e..d473146d1cdda4 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -729,15 +729,18 @@ bb5: ; preds = %bb3 ; IR: [[FLOW]]: ; IR-NEXT: phi -; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]] +; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock -; IR: [[FLOW2]]: -; IR-NEXT: br label %UnifiedReturnBlock +; IR: [[LOOP]]: +; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]] ; IR: [[EXP]]: ; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg false, i1 immarg true) ; IR-NEXT: br label %[[FLOW]] +; IR: [[FLOW1]]: +; IR-NEXT: br label %UnifiedReturnBlock + ; IR: UnifiedReturnBlock: ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) ; IR-NEXT: ret void @@ -745,13 +748,10 @@ bb5: ; preds = %bb3 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) { .entry: %tmp26 = fcmp olt float %tmp25, 0.000000e+00 - br i1 %tmp26, label %.preheader.1, label %bb27 - -.preheader.1: ; preds = %.entry - br label %bb + br i1 %tmp26, label %loop, label %bb27 -bb: ; preds = %bb, %.preheader.1 - br label %bb +loop: ; preds = %loop, %.entry + br label %loop bb27: ; preds = %.entry call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH3C00>, <2 x half> <half 0xH3C00, half 0xH3C00>, i1 immarg true, i1 immarg true) diff --git a/llvm/test/CodeGen/ARM/indvar-cost.ll b/llvm/test/CodeGen/ARM/indvar-cost.ll new file mode 100644 index 00000000000000..df4c71777b964c --- /dev/null +++ b/llvm/test/CodeGen/ARM/indvar-cost.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -mtriple=thumbv8m.base -S %s -o - | FileCheck %s --check-prefix=CHECK-T1 +; RUN: opt -indvars -mtriple=thumbv8m.main -S %s -o - | FileCheck %s --check-prefix=CHECK-T2 + +define dso_local arm_aapcscc void @arm_conv_fast_q15(i16* %pSrcA, i32 %srcALen, i16* %pSrcB, i32 %srcBLen, i16* %pDst, i16** %store.px, i16** %store.py, i32* %store.res) local_unnamed_addr { +; CHECK-T1-LABEL: @arm_conv_fast_q15( +; CHECK-T1-NEXT: entry: +; CHECK-T1-NEXT: [[CMP:%.*]] = icmp ult i32 [[SRCALEN:%.*]], [[SRCBLEN:%.*]] +; CHECK-T1-NEXT: [[SRCALEN_SRCBLEN:%.*]] = select i1
[[CMP]], i32 [[SRCALEN]], i32 [[SRCBLEN]] +; CHECK-T1-NEXT: [[PSRCB_PSRCA:%.*]] = select i1 [[CMP]], i16* [[PSRCB:%.*]], i16* [[PSRCA:%.*]] +; CHECK-T1-NEXT: [[PSRCA_PSRCB:%.*]] = select i1 [[CMP]], i16* [[PSRCA]], i16* [[PSRCB]] +; CHECK-T1-NEXT: [[SUB:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -1 +; CHECK-T1-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-T1-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] +; CHECK-T1: while.cond5.preheader.preheader: +; CHECK-T1-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 2 +; CHECK-T1-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 2 +; CHECK-T1-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] +; CHECK-T1: while.cond5.preheader: +; CHECK-T1-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_01083:%.*]] = phi i32 [ [[DEC12:%.*]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_01082:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[WHILE_END]] ], [ [[PSRCA_PSRCB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[POUT_01081:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_END]] ], [ [[PDST:%.*]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: br label [[WHILE_BODY7:%.*]] +; CHECK-T1: while.body7: +; CHECK-T1-NEXT: [[K_01078:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY7]] ], [ [[COUNT_01084]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_01077:%.*]] = phi i32 [ [[ADD6_I:%.*]], [[WHILE_BODY7]] ], [ 0, [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_11076:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_11075:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PX_11075]], i32 1 +; CHECK-T1-NEXT: [[TMP2:%.*]] = load i16, i16* [[PX_11075]], align 2 +; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-T1-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PY_11076]], i32 -1 +; CHECK-T1-NEXT: [[TMP3:%.*]] = load i16, i16* [[PY_11076]], align 2 +; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-T1-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] +; CHECK-T1-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 +; CHECK-T1-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 +; CHECK-T1-NEXT: [[MUL5_I:%.*]] = mul nsw i32 [[SHR4_I]], [[SHR3_I]] +; CHECK-T1-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUM_01077]] +; CHECK-T1-NEXT: [[ADD6_I]] = add i32 [[ADD_I]], [[MUL5_I]] +; CHECK-T1-NEXT: [[DEC]] = add nsw i32 [[K_01078]], -1 +; CHECK-T1-NEXT: [[CMP6:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-T1-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] +; CHECK-T1: while.end: +; CHECK-T1-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] +; CHECK-T1-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-T1-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_01081]], i32 1 +; CHECK-T1-NEXT: store i16 [[CONV10]], i16* [[POUT_01081]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[COUNT_01084]] +; CHECK-T1-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 +; CHECK-T1-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 +; 
CHECK-T1-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 +; CHECK-T1-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 +; CHECK-T1-NEXT: [[TMP5:%.*]] = and i1 [[CMP4]], [[CMP3]] +; CHECK-T1-NEXT: br i1 [[TMP5]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T1: while.end13.loopexit: +; CHECK-T1-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi i16* [ [[INCDEC_PTR11]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[DEC12_LCSSA:%.*]] = phi i32 [ [[DEC12]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[TMP6:%.*]] = add nuw nsw i32 [[UMIN]], 2 +; CHECK-T1-NEXT: br label [[WHILE_END13]] +; CHECK-T1: while.end13: +; CHECK-T1-NEXT: [[POUT_0_LCSSA:%.*]] = phi i16* [ [[PDST]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR11_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[PY_0_LCSSA:%.*]] = phi i16* [ [[PSRCA_PSRCB]], [[ENTRY]] ], [ [[ADD_PTR_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_0_LCSSA:%.*]] = phi i32 [ [[SUB]], [[ENTRY]] ], [ [[DEC12_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[CMP161068:%.*]] = icmp eq i32 [[BLOCKSIZE1_0_LCSSA]], 0 +; CHECK-T1-NEXT: br i1 [[CMP161068]], label [[EXIT:%.*]], label [[WHILE_BODY18_PREHEADER:%.*]] +; CHECK-T1: while.body18.preheader: +; CHECK-T1-NEXT: [[ADD_PTR14:%.*]] = getelementptr inbounds i16, i16* [[PY_0_LCSSA]], i32 -1 +; CHECK-T1-NEXT: br label [[WHILE_BODY18:%.*]] +; CHECK-T1: while.body18: +; CHECK-T1-NEXT: [[COUNT_11072:%.*]] = phi i32 [ [[INC49:%.*]], [[WHILE_END43:%.*]] ], [ [[COUNT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_11071:%.*]] = phi i32 [ [[DEC50:%.*]], [[WHILE_END43]] ], [ [[BLOCKSIZE1_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_21070:%.*]] = phi i16* [ [[ADD_PTR48:%.*]], [[WHILE_END43]] ], [ [[ADD_PTR14]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[POUT_11069:%.*]] = phi i16* [ [[INCDEC_PTR46:%.*]], [[WHILE_END43]] ], [ [[POUT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[SHR19:%.*]] = lshr i32 [[COUNT_11072]], 2 +; CHECK-T1-NEXT: [[CMP211054:%.*]] = icmp eq i32 [[SHR19]], 0 +; CHECK-T1-NEXT: br i1 [[CMP211054]], label [[WHILE_END31:%.*]], label [[WHILE_BODY23_PREHEADER:%.*]] +; CHECK-T1: while.body23.preheader: +; CHECK-T1-NEXT: br label [[WHILE_BODY23:%.*]] +; CHECK-T1: while.body23: +; CHECK-T1-NEXT: [[K_11058:%.*]] = phi i32 [ [[DEC30:%.*]], [[WHILE_BODY23]] ], [ [[SHR19]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_11057:%.*]] = phi i32 [ [[ADD6_I878:%.*]], [[WHILE_BODY23]] ], [ 0, [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_31056:%.*]] = phi i16* [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_31055:%.*]] = phi i16* [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 1 +; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX_I907]], align 2 +; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, i16* [[PX_31055]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 2 +; CHECK-T1-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 1 +; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX_I901]], align 2 +; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, i16* [[PY_31056]], align 2 
+; CHECK-T1-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -2 +; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T1-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] +; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] +; CHECK-T1-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] +; CHECK-T1-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] +; CHECK-T1-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 3 +; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX_I885]], align 2 +; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, i16* [[ADD_PTR_I912]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 4 +; CHECK-T1-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -1 +; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX_I879]], align 2 +; CHECK-T1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ADD_PTR_I906]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -4 +; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T1-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] +; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-T1-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] +; CHECK-T1-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] +; CHECK-T1-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] +; CHECK-T1-NEXT: [[DEC30]] = add nsw i32 [[K_11058]], -1 +; CHECK-T1-NEXT: [[CMP21:%.*]] = icmp eq i32 [[DEC30]], 0 +; CHECK-T1-NEXT: br i1 [[CMP21]], label [[WHILE_END31_LOOPEXIT:%.*]], label [[WHILE_BODY23]] +; CHECK-T1: while.end31.loopexit: +; CHECK-T1-NEXT: [[ADD_PTR_I890_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I890]], [[WHILE_BODY23]] ] +; CHECK-T1-NEXT: [[ADD_PTR_I884_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I884]], [[WHILE_BODY23]] ] +; CHECK-T1-NEXT: [[ADD6_I878_LCSSA:%.*]] = phi i32 [ [[ADD6_I878]], [[WHILE_BODY23]] ] +; CHECK-T1-NEXT: br label [[WHILE_END31]] +; CHECK-T1: while.end31: +; CHECK-T1-NEXT: [[PX_3_LCSSA:%.*]] = phi i16* [ [[PSRCB_PSRCA]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I890_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[PY_3_LCSSA:%.*]] = phi i16* [ [[PY_21070]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I884_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_BODY18]] ], [ [[ADD6_I878_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[REM:%.*]] = and i32 [[COUNT_11072]], 3 +; CHECK-T1-NEXT: [[CMP341062:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-T1-NEXT: br i1 [[CMP341062]], label [[WHILE_END43]], label [[WHILE_BODY36_PREHEADER:%.*]] +; CHECK-T1: while.body36.preheader: +; CHECK-T1-NEXT: [[ADD_PTR32:%.*]] = getelementptr inbounds i16, i16* [[PY_3_LCSSA]], i32 1 +; CHECK-T1-NEXT: br label [[WHILE_BODY36:%.*]] +; CHECK-T1: while.body36: +; CHECK-T1-NEXT: [[K_21066:%.*]] = phi i32 [ [[DEC42:%.*]], [[WHILE_BODY36]] ], [ [[REM]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_21065:%.*]] = phi i32 [ [[ADD6_I868:%.*]], [[WHILE_BODY36]] ], [ [[SUM_1_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; 
CHECK-T1-NEXT: [[PY_41064:%.*]] = phi i16* [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_41063:%.*]] = phi i16* [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, i16* [[PX_41063]], i32 1 +; CHECK-T1-NEXT: [[TMP15:%.*]] = load i16, i16* [[PX_41063]], align 2 +; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-T1-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, i16* [[PY_41064]], i32 -1 +; CHECK-T1-NEXT: [[TMP16:%.*]] = load i16, i16* [[PY_41064]], align 2 +; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-T1-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] +; CHECK-T1-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 +; CHECK-T1-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 +; CHECK-T1-NEXT: [[MUL5_I866:%.*]] = mul nsw i32 [[SHR4_I865]], [[SHR3_I864]] +; CHECK-T1-NEXT: [[ADD_I867:%.*]] = add i32 [[MUL_I863]], [[SUM_21065]] +; CHECK-T1-NEXT: [[ADD6_I868]] = add i32 [[ADD_I867]], [[MUL5_I866]] +; CHECK-T1-NEXT: [[DEC42]] = add nsw i32 [[K_21066]], -1 +; CHECK-T1-NEXT: [[CMP34:%.*]] = icmp eq i32 [[DEC42]], 0 +; CHECK-T1-NEXT: br i1 [[CMP34]], label [[WHILE_END43_LOOPEXIT:%.*]], label [[WHILE_BODY36]] +; CHECK-T1: while.end43.loopexit: +; CHECK-T1-NEXT: [[ADD6_I868_LCSSA:%.*]] = phi i32 [ [[ADD6_I868]], [[WHILE_BODY36]] ] +; CHECK-T1-NEXT: br label [[WHILE_END43]] +; CHECK-T1: while.end43: +; CHECK-T1-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[TMP17:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP17]] to i16 +; CHECK-T1-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, i16* [[POUT_11069]], i32 1 +; CHECK-T1-NEXT: store i16 [[CONV45]], i16* [[POUT_11069]], align 2 +; CHECK-T1-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 +; CHECK-T1-NEXT: [[ADD_PTR48]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[SUB47]] +; CHECK-T1-NEXT: [[INC49]] = add i32 [[COUNT_11072]], 1 +; CHECK-T1-NEXT: [[DEC50]] = add i32 [[BLOCKSIZE1_11071]], -1 +; CHECK-T1-NEXT: [[CMP16:%.*]] = icmp eq i32 [[DEC50]], 0 +; CHECK-T1-NEXT: br i1 [[CMP16]], label [[EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY18]] +; CHECK-T1: exit.loopexit: +; CHECK-T1-NEXT: br label [[EXIT]] +; CHECK-T1: exit: +; CHECK-T1-NEXT: ret void +; +; CHECK-T2-LABEL: @arm_conv_fast_q15( +; CHECK-T2-NEXT: entry: +; CHECK-T2-NEXT: [[CMP:%.*]] = icmp ult i32 [[SRCALEN:%.*]], [[SRCBLEN:%.*]] +; CHECK-T2-NEXT: [[SRCALEN_SRCBLEN:%.*]] = select i1 [[CMP]], i32 [[SRCALEN]], i32 [[SRCBLEN]] +; CHECK-T2-NEXT: [[PSRCB_PSRCA:%.*]] = select i1 [[CMP]], i16* [[PSRCB:%.*]], i16* [[PSRCA:%.*]] +; CHECK-T2-NEXT: [[PSRCA_PSRCB:%.*]] = select i1 [[CMP]], i16* [[PSRCA]], i16* [[PSRCB]] +; CHECK-T2-NEXT: [[SUB:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -1 +; CHECK-T2-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-T2-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] +; CHECK-T2: while.cond5.preheader.preheader: +; CHECK-T2-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 2 +; CHECK-T2-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 2 +; CHECK-T2-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] +; CHECK-T2: while.cond5.preheader: +; CHECK-T2-NEXT: [[COUNT_01084:%.*]] = phi i32 [ 
[[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_01083:%.*]] = phi i32 [ [[DEC12:%.*]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_01082:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[WHILE_END]] ], [ [[PSRCA_PSRCB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[POUT_01081:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_END]] ], [ [[PDST:%.*]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: br label [[WHILE_BODY7:%.*]] +; CHECK-T2: while.body7: +; CHECK-T2-NEXT: [[K_01078:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY7]] ], [ [[COUNT_01084]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_01077:%.*]] = phi i32 [ [[ADD6_I:%.*]], [[WHILE_BODY7]] ], [ 0, [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_11076:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_11075:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PX_11075]], i32 1 +; CHECK-T2-NEXT: [[TMP2:%.*]] = load i16, i16* [[PX_11075]], align 2 +; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-T2-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PY_11076]], i32 -1 +; CHECK-T2-NEXT: [[TMP3:%.*]] = load i16, i16* [[PY_11076]], align 2 +; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-T2-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] +; CHECK-T2-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 +; CHECK-T2-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 +; CHECK-T2-NEXT: [[MUL5_I:%.*]] = mul nsw i32 [[SHR4_I]], [[SHR3_I]] +; CHECK-T2-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUM_01077]] +; CHECK-T2-NEXT: [[ADD6_I]] = add i32 [[ADD_I]], [[MUL5_I]] +; CHECK-T2-NEXT: [[DEC]] = add nsw i32 [[K_01078]], -1 +; CHECK-T2-NEXT: [[CMP6:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-T2-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] +; CHECK-T2: while.end: +; CHECK-T2-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] +; CHECK-T2-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-T2-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_01081]], i32 1 +; CHECK-T2-NEXT: store i16 [[CONV10]], i16* [[POUT_01081]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[COUNT_01084]] +; CHECK-T2-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 +; CHECK-T2-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 +; CHECK-T2-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 +; CHECK-T2-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 +; CHECK-T2-NEXT: [[TMP5:%.*]] = and i1 [[CMP4]], [[CMP3]] +; CHECK-T2-NEXT: br i1 [[TMP5]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T2: while.end13.loopexit: +; CHECK-T2-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi i16* [ [[INCDEC_PTR11]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[DEC12_LCSSA:%.*]] = phi i32 [ [[DEC12]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[TMP6:%.*]] = add nuw nsw i32 [[UMIN]], 2 +; CHECK-T2-NEXT: br label [[WHILE_END13]] +; CHECK-T2: while.end13: +; CHECK-T2-NEXT: [[POUT_0_LCSSA:%.*]] = phi i16* [ [[PDST]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR11_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: 
[[PY_0_LCSSA:%.*]] = phi i16* [ [[PSRCA_PSRCB]], [[ENTRY]] ], [ [[ADD_PTR_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_0_LCSSA:%.*]] = phi i32 [ [[SUB]], [[ENTRY]] ], [ [[DEC12_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[CMP161068:%.*]] = icmp eq i32 [[BLOCKSIZE1_0_LCSSA]], 0 +; CHECK-T2-NEXT: br i1 [[CMP161068]], label [[EXIT:%.*]], label [[WHILE_BODY18_PREHEADER:%.*]] +; CHECK-T2: while.body18.preheader: +; CHECK-T2-NEXT: [[ADD_PTR14:%.*]] = getelementptr inbounds i16, i16* [[PY_0_LCSSA]], i32 -1 +; CHECK-T2-NEXT: br label [[WHILE_BODY18:%.*]] +; CHECK-T2: while.body18: +; CHECK-T2-NEXT: [[COUNT_11072:%.*]] = phi i32 [ [[INC49:%.*]], [[WHILE_END43:%.*]] ], [ [[COUNT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_11071:%.*]] = phi i32 [ [[DEC50:%.*]], [[WHILE_END43]] ], [ [[BLOCKSIZE1_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_21070:%.*]] = phi i16* [ [[ADD_PTR48:%.*]], [[WHILE_END43]] ], [ [[ADD_PTR14]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[POUT_11069:%.*]] = phi i16* [ [[INCDEC_PTR46:%.*]], [[WHILE_END43]] ], [ [[POUT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[SHR19:%.*]] = lshr i32 [[COUNT_11072]], 2 +; CHECK-T2-NEXT: [[CMP211054:%.*]] = icmp eq i32 [[SHR19]], 0 +; CHECK-T2-NEXT: br i1 [[CMP211054]], label [[WHILE_END31:%.*]], label [[WHILE_BODY23_PREHEADER:%.*]] +; CHECK-T2: while.body23.preheader: +; CHECK-T2-NEXT: br label [[WHILE_BODY23:%.*]] +; CHECK-T2: while.body23: +; CHECK-T2-NEXT: [[K_11058:%.*]] = phi i32 [ [[DEC30:%.*]], [[WHILE_BODY23]] ], [ [[SHR19]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_11057:%.*]] = phi i32 [ [[ADD6_I878:%.*]], [[WHILE_BODY23]] ], [ 0, [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_31056:%.*]] = phi i16* [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_31055:%.*]] = phi i16* [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 1 +; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX_I907]], align 2 +; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, i16* [[PX_31055]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 2 +; CHECK-T2-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 1 +; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX_I901]], align 2 +; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, i16* [[PY_31056]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -2 +; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T2-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] +; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T2-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] +; CHECK-T2-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] +; CHECK-T2-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] +; CHECK-T2-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 3 +; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX_I885]], align 2 +; CHECK-T2-NEXT: 
[[TMP12:%.*]] = load i16, i16* [[ADD_PTR_I912]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 4 +; CHECK-T2-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -1 +; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX_I879]], align 2 +; CHECK-T2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ADD_PTR_I906]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -4 +; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T2-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] +; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-T2-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] +; CHECK-T2-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] +; CHECK-T2-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] +; CHECK-T2-NEXT: [[DEC30]] = add nsw i32 [[K_11058]], -1 +; CHECK-T2-NEXT: [[CMP21:%.*]] = icmp eq i32 [[DEC30]], 0 +; CHECK-T2-NEXT: br i1 [[CMP21]], label [[WHILE_END31_LOOPEXIT:%.*]], label [[WHILE_BODY23]] +; CHECK-T2: while.end31.loopexit: +; CHECK-T2-NEXT: [[ADD_PTR_I890_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I890]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: [[ADD_PTR_I884_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I884]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: [[ADD6_I878_LCSSA:%.*]] = phi i32 [ [[ADD6_I878]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: br label [[WHILE_END31]] +; CHECK-T2: while.end31: +; CHECK-T2-NEXT: [[PX_3_LCSSA:%.*]] = phi i16* [ [[PSRCB_PSRCA]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I890_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[PY_3_LCSSA:%.*]] = phi i16* [ [[PY_21070]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I884_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_BODY18]] ], [ [[ADD6_I878_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[REM:%.*]] = and i32 [[COUNT_11072]], 3 +; CHECK-T2-NEXT: [[CMP341062:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-T2-NEXT: br i1 [[CMP341062]], label [[WHILE_END43]], label [[WHILE_BODY36_PREHEADER:%.*]] +; CHECK-T2: while.body36.preheader: +; CHECK-T2-NEXT: [[ADD_PTR32:%.*]] = getelementptr inbounds i16, i16* [[PY_3_LCSSA]], i32 1 +; CHECK-T2-NEXT: br label [[WHILE_BODY36:%.*]] +; CHECK-T2: while.body36: +; CHECK-T2-NEXT: [[K_21066:%.*]] = phi i32 [ [[DEC42:%.*]], [[WHILE_BODY36]] ], [ [[REM]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_21065:%.*]] = phi i32 [ [[ADD6_I868:%.*]], [[WHILE_BODY36]] ], [ [[SUM_1_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_41064:%.*]] = phi i16* [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_41063:%.*]] = phi i16* [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, i16* [[PX_41063]], i32 1 +; CHECK-T2-NEXT: [[TMP15:%.*]] = load i16, i16* [[PX_41063]], align 2 +; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-T2-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, i16* [[PY_41064]], i32 -1 +; CHECK-T2-NEXT: [[TMP16:%.*]] = load i16, i16* [[PY_41064]], align 2 +; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-T2-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] +; CHECK-T2-NEXT: [[SHR3_I864:%.*]] = ashr i32 
[[CONV38]], 16 +; CHECK-T2-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 +; CHECK-T2-NEXT: [[MUL5_I866:%.*]] = mul nsw i32 [[SHR4_I865]], [[SHR3_I864]] +; CHECK-T2-NEXT: [[ADD_I867:%.*]] = add i32 [[MUL_I863]], [[SUM_21065]] +; CHECK-T2-NEXT: [[ADD6_I868]] = add i32 [[ADD_I867]], [[MUL5_I866]] +; CHECK-T2-NEXT: [[DEC42]] = add nsw i32 [[K_21066]], -1 +; CHECK-T2-NEXT: [[CMP34:%.*]] = icmp eq i32 [[DEC42]], 0 +; CHECK-T2-NEXT: br i1 [[CMP34]], label [[WHILE_END43_LOOPEXIT:%.*]], label [[WHILE_BODY36]] +; CHECK-T2: while.end43.loopexit: +; CHECK-T2-NEXT: [[ADD6_I868_LCSSA:%.*]] = phi i32 [ [[ADD6_I868]], [[WHILE_BODY36]] ] +; CHECK-T2-NEXT: br label [[WHILE_END43]] +; CHECK-T2: while.end43: +; CHECK-T2-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[TMP17:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP17]] to i16 +; CHECK-T2-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, i16* [[POUT_11069]], i32 1 +; CHECK-T2-NEXT: store i16 [[CONV45]], i16* [[POUT_11069]], align 2 +; CHECK-T2-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 +; CHECK-T2-NEXT: [[ADD_PTR48]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[SUB47]] +; CHECK-T2-NEXT: [[INC49]] = add i32 [[COUNT_11072]], 1 +; CHECK-T2-NEXT: [[DEC50]] = add i32 [[BLOCKSIZE1_11071]], -1 +; CHECK-T2-NEXT: [[CMP16:%.*]] = icmp eq i32 [[DEC50]], 0 +; CHECK-T2-NEXT: br i1 [[CMP16]], label [[EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY18]] +; CHECK-T2: exit.loopexit: +; CHECK-T2-NEXT: br label [[EXIT]] +; CHECK-T2: exit: +; CHECK-T2-NEXT: ret void +; +entry: + %cmp = icmp ult i32 %srcALen, %srcBLen + %srcALen.srcBLen = select i1 %cmp, i32 %srcALen, i32 %srcBLen + %pSrcB.pSrcA = select i1 %cmp, i16* %pSrcB, i16* %pSrcA + %pSrcA.pSrcB = select i1 %cmp, i16* %pSrcA, i16* %pSrcB + %sub = add i32 %srcALen.srcBLen, -1 + %cmp41080 = icmp eq i32 %sub, 0 + br i1 %cmp41080, label %while.end13, label %while.cond5.preheader + +while.cond5.preheader: ; preds = %while.end, %entry + %count.01084 = phi i32 [ %inc, %while.end ], [ 1, %entry ] + %blockSize1.01083 = phi i32 [ %dec12, %while.end ], [ %sub, %entry ] + %py.01082 = phi i16* [ %add.ptr, %while.end ], [ %pSrcA.pSrcB, %entry ] + %pOut.01081 = phi i16* [ %incdec.ptr11, %while.end ], [ %pDst, %entry ] + br label %while.body7 + +while.body7: ; preds = %while.body7, %while.cond5.preheader + %k.01078 = phi i32 [ %dec, %while.body7 ], [ %count.01084, %while.cond5.preheader ] + %sum.01077 = phi i32 [ %add6.i, %while.body7 ], [ 0, %while.cond5.preheader ] + %py.11076 = phi i16* [ %incdec.ptr8, %while.body7 ], [ %py.01082, %while.cond5.preheader ] + %px.11075 = phi i16* [ %incdec.ptr, %while.body7 ], [ %pSrcB.pSrcA, %while.cond5.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %px.11075, i32 1 + %0 = load i16, i16* %px.11075, align 2 + %conv = sext i16 %0 to i32 + %incdec.ptr8 = getelementptr inbounds i16, i16* %py.11076, i32 -1 + %1 = load i16, i16* %py.11076, align 2 + %conv9 = sext i16 %1 to i32 + %mul.i = mul nsw i32 %conv9, %conv + %shr3.i = ashr i32 %conv, 16 + %shr4.i = ashr i32 %conv9, 16 + %mul5.i = mul nsw i32 %shr4.i, %shr3.i + %add.i = add i32 %mul.i, %sum.01077 + %add6.i = add i32 %add.i, %mul5.i + %dec = add nsw i32 %k.01078, -1 + %cmp6 = icmp eq i32 %dec, 0 + br i1 %cmp6, label %while.end, label %while.body7 + +while.end: ; preds = %while.body7 + %2 = lshr i32 %add6.i, 15 + %conv10 = trunc i32 %2 to i16 + %incdec.ptr11 = getelementptr inbounds 
i16, i16* %pOut.01081, i32 1 + store i16 %conv10, i16* %pOut.01081, align 2 + %add.ptr = getelementptr inbounds i16, i16* %pSrcA.pSrcB, i32 %count.01084 + %inc = add nuw nsw i32 %count.01084, 1 + %dec12 = add i32 %blockSize1.01083, -1 + %cmp3 = icmp ult i32 %count.01084, 3 + %cmp4 = icmp ne i32 %dec12, 0 + %3 = and i1 %cmp4, %cmp3 + br i1 %3, label %while.cond5.preheader, label %while.end13 + +while.end13: ; preds = %while.end, %entry + %pOut.0.lcssa = phi i16* [ %pDst, %entry ], [ %incdec.ptr11, %while.end ] + %py.0.lcssa = phi i16* [ %pSrcA.pSrcB, %entry ], [ %add.ptr, %while.end ] + %blockSize1.0.lcssa = phi i32 [ %sub, %entry ], [ %dec12, %while.end ] + %count.0.lcssa = phi i32 [ 1, %entry ], [ %inc, %while.end ] + %cmp161068 = icmp eq i32 %blockSize1.0.lcssa, 0 + br i1 %cmp161068, label %exit, label %while.body18.preheader + +while.body18.preheader: ; preds = %while.end13 + %add.ptr14 = getelementptr inbounds i16, i16* %py.0.lcssa, i32 -1 + br label %while.body18 + +while.body18: ; preds = %while.end43, %while.body18.preheader + %count.11072 = phi i32 [ %inc49, %while.end43 ], [ %count.0.lcssa, %while.body18.preheader ] + %blockSize1.11071 = phi i32 [ %dec50, %while.end43 ], [ %blockSize1.0.lcssa, %while.body18.preheader ] + %py.21070 = phi i16* [ %add.ptr48, %while.end43 ], [ %add.ptr14, %while.body18.preheader ] + %pOut.11069 = phi i16* [ %incdec.ptr46, %while.end43 ], [ %pOut.0.lcssa, %while.body18.preheader ] + %shr19 = lshr i32 %count.11072, 2 + %cmp211054 = icmp eq i32 %shr19, 0 + br i1 %cmp211054, label %while.end31, label %while.body23 + +while.body23: ; preds = %while.body23, %while.body18 + %k.11058 = phi i32 [ %dec30, %while.body23 ], [ %shr19, %while.body18 ] + %sum.11057 = phi i32 [ %add6.i878, %while.body23 ], [ 0, %while.body18 ] + %py.31056 = phi i16* [ %add.ptr.i884, %while.body23 ], [ %py.21070, %while.body18 ] + %px.31055 = phi i16* [ %add.ptr.i890, %while.body23 ], [ %pSrcB.pSrcA, %while.body18 ] + %arrayidx.i907 = getelementptr inbounds i16, i16* %px.31055, i32 1 + %4 = load i16, i16* %arrayidx.i907, align 2 + %5 = load i16, i16* %px.31055, align 2 + %add.ptr.i912 = getelementptr inbounds i16, i16* %px.31055, i32 2 + %arrayidx.i901 = getelementptr inbounds i16, i16* %py.31056, i32 1 + %6 = load i16, i16* %arrayidx.i901, align 2 + %7 = load i16, i16* %py.31056, align 2 + %add.ptr.i906 = getelementptr inbounds i16, i16* %py.31056, i32 -2 + %shr.i892 = sext i16 %5 to i32 + %shr1.i893 = sext i16 %6 to i32 + %mul.i894 = mul nsw i32 %shr1.i893, %shr.i892 + %shr2.i895 = sext i16 %4 to i32 + %shr4.i897 = sext i16 %7 to i32 + %mul5.i898 = mul nsw i32 %shr4.i897, %shr2.i895 + %add.i899 = add i32 %mul.i894, %sum.11057 + %add6.i900 = add i32 %add.i899, %mul5.i898 + %arrayidx.i885 = getelementptr inbounds i16, i16* %px.31055, i32 3 + %8 = load i16, i16* %arrayidx.i885, align 2 + %9 = load i16, i16* %add.ptr.i912, align 2 + %add.ptr.i890 = getelementptr inbounds i16, i16* %px.31055, i32 4 + %arrayidx.i879 = getelementptr inbounds i16, i16* %py.31056, i32 -1 + %10 = load i16, i16* %arrayidx.i879, align 2 + %11 = load i16, i16* %add.ptr.i906, align 2 + %add.ptr.i884 = getelementptr inbounds i16, i16* %py.31056, i32 -4 + %shr.i870 = sext i16 %9 to i32 + %shr1.i871 = sext i16 %10 to i32 + %mul.i872 = mul nsw i32 %shr1.i871, %shr.i870 + %shr2.i873 = sext i16 %8 to i32 + %shr4.i875 = sext i16 %11 to i32 + %mul5.i876 = mul nsw i32 %shr4.i875, %shr2.i873 + %add.i877 = add i32 %add6.i900, %mul.i872 + %add6.i878 = add i32 %add.i877, %mul5.i876 + %dec30 = add nsw i32 %k.11058, -1 + %cmp21 
= icmp eq i32 %dec30, 0 + br i1 %cmp21, label %while.end31, label %while.body23 + +while.end31: ; preds = %while.body23, %while.body18 + %px.3.lcssa = phi i16* [ %pSrcB.pSrcA, %while.body18 ], [ %add.ptr.i890, %while.body23 ] + %py.3.lcssa = phi i16* [ %py.21070, %while.body18 ], [ %add.ptr.i884, %while.body23 ] + %sum.1.lcssa = phi i32 [ 0, %while.body18 ], [ %add6.i878, %while.body23 ] + %rem = and i32 %count.11072, 3 + %cmp341062 = icmp eq i32 %rem, 0 + br i1 %cmp341062, label %while.end43, label %while.body36.preheader + +while.body36.preheader: ; preds = %while.end31 + %add.ptr32 = getelementptr inbounds i16, i16* %py.3.lcssa, i32 1 + br label %while.body36 + +while.body36: ; preds = %while.body36, %while.body36.preheader + %k.21066 = phi i32 [ %dec42, %while.body36 ], [ %rem, %while.body36.preheader ] + %sum.21065 = phi i32 [ %add6.i868, %while.body36 ], [ %sum.1.lcssa, %while.body36.preheader ] + %py.41064 = phi i16* [ %incdec.ptr39, %while.body36 ], [ %add.ptr32, %while.body36.preheader ] + %px.41063 = phi i16* [ %incdec.ptr37, %while.body36 ], [ %px.3.lcssa, %while.body36.preheader ] + %incdec.ptr37 = getelementptr inbounds i16, i16* %px.41063, i32 1 + %12 = load i16, i16* %px.41063, align 2 + %conv38 = sext i16 %12 to i32 + %incdec.ptr39 = getelementptr inbounds i16, i16* %py.41064, i32 -1 + %13 = load i16, i16* %py.41064, align 2 + %conv40 = sext i16 %13 to i32 + %mul.i863 = mul nsw i32 %conv40, %conv38 + %shr3.i864 = ashr i32 %conv38, 16 + %shr4.i865 = ashr i32 %conv40, 16 + %mul5.i866 = mul nsw i32 %shr4.i865, %shr3.i864 + %add.i867 = add i32 %mul.i863, %sum.21065 + %add6.i868 = add i32 %add.i867, %mul5.i866 + %dec42 = add nsw i32 %k.21066, -1 + %cmp34 = icmp eq i32 %dec42, 0 + br i1 %cmp34, label %while.end43, label %while.body36 + +while.end43: ; preds = %while.body36, %while.end31 + %sum.2.lcssa = phi i32 [ %sum.1.lcssa, %while.end31 ], [ %add6.i868, %while.body36 ] + %14 = lshr i32 %sum.2.lcssa, 15 + %conv45 = trunc i32 %14 to i16 + %incdec.ptr46 = getelementptr inbounds i16, i16* %pOut.11069, i32 1 + store i16 %conv45, i16* %pOut.11069, align 2 + %sub47 = add i32 %count.11072, -1 + %add.ptr48 = getelementptr inbounds i16, i16* %pSrcA.pSrcB, i32 %sub47 + %inc49 = add i32 %count.11072, 1 + %dec50 = add i32 %blockSize1.11071, -1 + %cmp16 = icmp eq i32 %dec50, 0 + br i1 %cmp16, label %exit, label %while.body18 + +exit: ; preds = %while.end43, %while.end13 + ret void +} diff --git a/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll b/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll new file mode 100644 index 00000000000000..36749a03553eaa --- /dev/null +++ b/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll @@ -0,0 +1,578 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -loop-unroll -mtriple=thumbv8m.main %s -S -o - | FileCheck %s + +define dso_local arm_aapcscc void @test(i32* nocapture %pDest, i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32 %blkCnt) local_unnamed_addr #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP88:%.*]] = icmp eq i32 [[BLKCNT:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP88]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_092:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; 
CHECK-NEXT: [[PDEST_ADDR_091:%.*]] = phi i32* [ [[PDEST_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PDEST:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PSRCA_ADDR_090:%.*]] = phi i16* [ [[PSRCA_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCA:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PSRCB_ADDR_089:%.*]] = phi i16* [ [[PSRCB_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCB:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 [[I_092]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[I_092]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483644 +; CHECK-NEXT: [[CMP272:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[CMP272]], label [[FOR_END:%.*]], label [[FOR_BODY3_PREHEADER:%.*]] +; CHECK: for.body3.preheader: +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY3_PREHEADER_NEW:%.*]] +; CHECK: for.body3.preheader.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP3]], [[XTRAITER]] +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[J_076:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD24_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PDEST_ADDR_175:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[INCDEC_PTR_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR23_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[PSRCA_ADDR_174]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[PSRCB_ADDR_173]], align 2 +; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[CONV12:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[CONV14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[CONV14]], [[CONV12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX17]], align 2 +; CHECK-NEXT: [[CONV18:%.*]] = sext 
i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD21:%.*]] = add i32 [[MUL10]], [[MUL]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ADD21]], [[CONV14]] +; CHECK-NEXT: [[ADD16:%.*]] = add i32 [[ADD]], [[MUL15]] +; CHECK-NEXT: [[ADD22:%.*]] = add i32 [[ADD16]], [[CONV18]] +; CHECK-NEXT: store i32 [[ADD22]], i32* [[PDEST_ADDR_175]], align 4 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 4 +; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175]], i32 1 +; CHECK-NEXT: [[ADD24:%.*]] = add nuw nsw i32 [[J_076]], 4 +; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ADD_PTR]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[ADD_PTR23]], align 2 +; CHECK-NEXT: [[CONV5_1:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV5_1]], [[CONV_1]] +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, i16* [[ARRAYIDX6_1]], align 2 +; CHECK-NEXT: [[CONV7_1:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX8_1]], align 2 +; CHECK-NEXT: [[CONV9_1:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[MUL10_1:%.*]] = mul nsw i32 [[CONV9_1]], [[CONV7_1]] +; CHECK-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[ARRAYIDX11_1]], align 2 +; CHECK-NEXT: [[CONV12_1:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX13_1]], align 2 +; CHECK-NEXT: [[CONV14_1:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[MUL15_1:%.*]] = mul nsw i32 [[CONV14_1]], [[CONV12_1]] +; CHECK-NEXT: [[ARRAYIDX17_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX17_1]], align 2 +; CHECK-NEXT: [[CONV18_1:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[ADD21_1:%.*]] = add i32 [[MUL10_1]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD21_1]], [[CONV14_1]] +; CHECK-NEXT: [[ADD16_1:%.*]] = add i32 [[ADD_1]], [[MUL15_1]] +; CHECK-NEXT: [[ADD22_1:%.*]] = add i32 [[ADD16_1]], [[CONV18_1]] +; CHECK-NEXT: store i32 [[ADD22_1]], i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR]], i32 1 +; CHECK-NEXT: [[ADD24_1:%.*]] = add nuw nsw i32 [[ADD24]], 4 +; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP22]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = load i16, i16* [[ADD_PTR23_1]], align 2 +; CHECK-NEXT: [[CONV5_2:%.*]] = sext i16 [[TMP23]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[CONV5_2]], [[CONV_2]] +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX6_2]], align 2 +; CHECK-NEXT: [[CONV7_2:%.*]] = sext i16 
[[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i16, i16* [[ARRAYIDX8_2]], align 2 +; CHECK-NEXT: [[CONV9_2:%.*]] = sext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[MUL10_2:%.*]] = mul nsw i32 [[CONV9_2]], [[CONV7_2]] +; CHECK-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX11_2]], align 2 +; CHECK-NEXT: [[CONV12_2:%.*]] = sext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX13_2]], align 2 +; CHECK-NEXT: [[CONV14_2:%.*]] = sext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[MUL15_2:%.*]] = mul nsw i32 [[CONV14_2]], [[CONV12_2]] +; CHECK-NEXT: [[ARRAYIDX17_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, i16* [[ARRAYIDX17_2]], align 2 +; CHECK-NEXT: [[CONV18_2:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[ADD21_2:%.*]] = add i32 [[MUL10_2]], [[MUL_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD21_2]], [[CONV14_2]] +; CHECK-NEXT: [[ADD16_2:%.*]] = add i32 [[ADD_2]], [[MUL15_2]] +; CHECK-NEXT: [[ADD22_2:%.*]] = add i32 [[ADD16_2]], [[CONV18_2]] +; CHECK-NEXT: store i32 [[ADD22_2]], i32* [[INCDEC_PTR_1]], align 4 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_1]], i32 1 +; CHECK-NEXT: [[ADD24_2:%.*]] = add nuw nsw i32 [[ADD24_1]], 4 +; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = load i16, i16* [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = load i16, i16* [[ADD_PTR23_2]], align 2 +; CHECK-NEXT: [[CONV5_3:%.*]] = sext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[CONV5_3]], [[CONV_3]] +; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 1 +; CHECK-NEXT: [[TMP31:%.*]] = load i16, i16* [[ARRAYIDX6_3]], align 2 +; CHECK-NEXT: [[CONV7_3:%.*]] = sext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[ARRAYIDX8_3]], align 2 +; CHECK-NEXT: [[CONV9_3:%.*]] = sext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[MUL10_3:%.*]] = mul nsw i32 [[CONV9_3]], [[CONV7_3]] +; CHECK-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = load i16, i16* [[ARRAYIDX11_3]], align 2 +; CHECK-NEXT: [[CONV12_3:%.*]] = sext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 3 +; CHECK-NEXT: [[TMP34:%.*]] = load i16, i16* [[ARRAYIDX13_3]], align 2 +; CHECK-NEXT: [[CONV14_3:%.*]] = sext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[MUL15_3:%.*]] = mul nsw i32 [[CONV14_3]], [[CONV12_3]] +; CHECK-NEXT: [[ARRAYIDX17_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = load i16, i16* [[ARRAYIDX17_3]], align 2 +; CHECK-NEXT: [[CONV18_3:%.*]] = sext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[ADD21_3:%.*]] = add i32 [[MUL10_3]], [[MUL_3]] +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD21_3]], [[CONV14_3]] +; CHECK-NEXT: [[ADD16_3:%.*]] = add i32 
[[ADD_3]], [[MUL15_3]] +; CHECK-NEXT: [[ADD22_3:%.*]] = add i32 [[ADD16_3]], [[CONV18_3]] +; CHECK-NEXT: store i32 [[ADD22_3]], i32* [[INCDEC_PTR_2]], align 4 +; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_3]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_2]], i32 1 +; CHECK-NEXT: [[ADD24_3]] = add nuw nsw i32 [[ADD24_2]], 4 +; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1 +; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp ne i32 [[NITER_NSUB_3]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_BODY3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit.unr-lcssa.loopexit: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[J_076_UNR_PH:%.*]] = phi i32 [ [[ADD24_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PDEST_ADDR_175_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] +; CHECK: for.end.loopexit.unr-lcssa: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR23_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH:%.*]] = phi i32* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[INCDEC_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[J_076_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER]] ], [ [[J_076_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_175_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER]] ], [ [[PDEST_ADDR_175_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCA_ADDR_174_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCB_ADDR_173_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY3_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.body3.epil.preheader: +; CHECK-NEXT: br label [[FOR_BODY3_EPIL:%.*]] +; CHECK: for.body3.epil: +; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[PSRCA_ADDR_174_UNR]], align 2 +; CHECK-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = load i16, i16* [[PSRCB_ADDR_173_UNR]], align 2 +; CHECK-NEXT: [[CONV5_EPIL:%.*]] = sext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[CONV5_EPIL]], [[CONV_EPIL]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL]], align 2 +; CHECK-NEXT: [[CONV7_EPIL:%.*]] = sext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL:%.*]] = getelementptr inbounds 
i16, i16* [[PSRCB_ADDR_173_UNR]], i32 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL]], align 2 +; CHECK-NEXT: [[CONV9_EPIL:%.*]] = sext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[MUL10_EPIL:%.*]] = mul nsw i32 [[CONV9_EPIL]], [[CONV7_EPIL]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL]], align 2 +; CHECK-NEXT: [[CONV12_EPIL:%.*]] = sext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 3 +; CHECK-NEXT: [[TMP41:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL]], align 2 +; CHECK-NEXT: [[CONV14_EPIL:%.*]] = sext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[MUL15_EPIL:%.*]] = mul nsw i32 [[CONV14_EPIL]], [[CONV12_EPIL]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 3 +; CHECK-NEXT: [[TMP42:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL]], align 2 +; CHECK-NEXT: [[CONV18_EPIL:%.*]] = sext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[ADD21_EPIL:%.*]] = add i32 [[MUL10_EPIL]], [[MUL_EPIL]] +; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i32 [[ADD21_EPIL]], [[CONV14_EPIL]] +; CHECK-NEXT: [[ADD16_EPIL:%.*]] = add i32 [[ADD_EPIL]], [[MUL15_EPIL]] +; CHECK-NEXT: [[ADD22_EPIL:%.*]] = add i32 [[ADD16_EPIL]], [[CONV18_EPIL]] +; CHECK-NEXT: store i32 [[ADD22_EPIL]], i32* [[PDEST_ADDR_175_UNR]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175_UNR]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL:%.*]] = add nuw nsw i32 [[J_076_UNR]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0 +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_BODY3_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] +; CHECK: for.end.loopexit.epilog-lcssa: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH1:%.*]] = phi i16* [ [[ADD_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[ADD_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2:%.*]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH2:%.*]] = phi i16* [ [[ADD_PTR23_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[ADD_PTR23_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR23_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH3:%.*]] = phi i32* [ [[INCDEC_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[INCDEC_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[INCDEC_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR_LCSSA_PH1]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA:%.*]] = phi i16* [ [[ADD_PTR23_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR23_LCSSA_PH2]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i32* [ [[INCDEC_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[INCDEC_PTR_LCSSA_PH3]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[PSRCB_ADDR_1_LCSSA:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY]] ], [ [[ADD_PTR23_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_1_LCSSA:%.*]] = phi i16* [ 
[[PSRCA_ADDR_090]], [[FOR_BODY]] ], [ [[ADD_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_1_LCSSA:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY]] ], [ [[INCDEC_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[J_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP6]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP4]], 3 +; CHECK-NEXT: [[ADD25:%.*]] = or i32 [[J_0_LCSSA]], [[REM]] +; CHECK-NEXT: [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]] +; CHECK: for.body29.preheader: +; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[ADD25]], -1 +; CHECK-NEXT: [[TMP46:%.*]] = sub i32 [[TMP45]], [[J_0_LCSSA]] +; CHECK-NEXT: [[XTRAITER4:%.*]] = and i32 [[TMP44]], 3 +; CHECK-NEXT: [[LCMP_MOD5:%.*]] = icmp ne i32 [[XTRAITER4]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD5]], label [[FOR_BODY29_PROL_PREHEADER:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT:%.*]] +; CHECK: for.body29.prol.preheader: +; CHECK-NEXT: br label [[FOR_BODY29_PROL:%.*]] +; CHECK: for.body29.prol: +; CHECK-NEXT: [[ARRAYIDX30_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX30_PROL]], align 2 +; CHECK-NEXT: [[CONV31_PROL:%.*]] = sext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX32_PROL]], align 2 +; CHECK-NEXT: [[CONV33_PROL:%.*]] = sext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[MUL34_PROL:%.*]] = mul nsw i32 [[CONV33_PROL]], [[CONV31_PROL]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[PDEST_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[ADD35_PROL:%.*]] = add nsw i32 [[MUL34_PROL]], [[TMP49]] +; CHECK-NEXT: store i32 [[ADD35_PROL]], i32* [[PDEST_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INC_PROL:%.*]] = add nuw i32 [[J_0_LCSSA]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB:%.*]] = sub i32 [[XTRAITER4]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i32 [[PROL_ITER_SUB]], 0 +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY29_PROL_1:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA:%.*]] +; CHECK: for.body29.prol.loopexit.unr-lcssa: +; CHECK-NEXT: [[J_184_UNR_PH:%.*]] = phi i32 [ [[INC_PROL]], [[FOR_BODY29_PROL]] ], [ [[INC_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INC_PROL_2:%.*]], [[FOR_BODY29_PROL_2:%.*]] ] +; CHECK-NEXT: [[PDEST_ADDR_283_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR38_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR38_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR38_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR36_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR36_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR36_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR37_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR37_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR37_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; 
CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT]] +; CHECK: for.body29.prol.loopexit: +; CHECK-NEXT: [[J_184_UNR:%.*]] = phi i32 [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[J_184_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PDEST_ADDR_283_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PDEST_ADDR_283_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCA_ADDR_282_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCB_ADDR_281_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ult i32 [[TMP46]], 3 +; CHECK-NEXT: br i1 [[TMP50]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29_PREHEADER_NEW:%.*]] +; CHECK: for.body29.preheader.new: +; CHECK-NEXT: br label [[FOR_BODY29:%.*]] +; CHECK: for.body29: +; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[J_184_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PDEST_ADDR_283:%.*]] = phi i32* [ [[PDEST_ADDR_283_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR38_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282:%.*]] = phi i16* [ [[PSRCA_ADDR_282_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR36_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281:%.*]] = phi i16* [ [[PSRCB_ADDR_281_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR37_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 [[J_184]] +; CHECK-NEXT: [[TMP51:%.*]] = load i16, i16* [[ARRAYIDX30]], align 2 +; CHECK-NEXT: [[CONV31:%.*]] = sext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 [[J_184]] +; CHECK-NEXT: [[TMP52:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 +; CHECK-NEXT: [[CONV33:%.*]] = sext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[MUL34:%.*]] = mul nsw i32 [[CONV33]], [[CONV31]] +; CHECK-NEXT: [[TMP53:%.*]] = load i32, i32* [[PDEST_ADDR_283]], align 4 +; CHECK-NEXT: [[ADD35:%.*]] = add nsw i32 [[MUL34]], [[TMP53]] +; CHECK-NEXT: store i32 [[ADD35]], i32* [[PDEST_ADDR_283]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_283]], i32 1 +; CHECK-NEXT: [[INC:%.*]] = add nuw i32 [[J_184]], 1 +; CHECK-NEXT: [[ARRAYIDX30_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 [[INC]] +; CHECK-NEXT: [[TMP54:%.*]] = load i16, i16* [[ARRAYIDX30_1]], align 2 +; CHECK-NEXT: [[CONV31_1:%.*]] = sext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 [[INC]] +; CHECK-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX32_1]], align 2 +; CHECK-NEXT: [[CONV33_1:%.*]] = sext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[MUL34_1:%.*]] = mul nsw i32 [[CONV33_1]], [[CONV31_1]] +; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[INCDEC_PTR38]], align 4 +; CHECK-NEXT: [[ADD35_1:%.*]] = add nsw i32 [[MUL34_1]], [[TMP56]] +; CHECK-NEXT: store i32 [[ADD35_1]], i32* [[INCDEC_PTR38]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 1 +; CHECK-NEXT: 
[[INCDEC_PTR37_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38]], i32 1 +; CHECK-NEXT: [[INC_1:%.*]] = add nuw i32 [[INC]], 1 +; CHECK-NEXT: [[ARRAYIDX30_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP57:%.*]] = load i16, i16* [[ARRAYIDX30_2]], align 2 +; CHECK-NEXT: [[CONV31_2:%.*]] = sext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP58:%.*]] = load i16, i16* [[ARRAYIDX32_2]], align 2 +; CHECK-NEXT: [[CONV33_2:%.*]] = sext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[MUL34_2:%.*]] = mul nsw i32 [[CONV33_2]], [[CONV31_2]] +; CHECK-NEXT: [[TMP59:%.*]] = load i32, i32* [[INCDEC_PTR38_1]], align 4 +; CHECK-NEXT: [[ADD35_2:%.*]] = add nsw i32 [[MUL34_2]], [[TMP59]] +; CHECK-NEXT: store i32 [[ADD35_2]], i32* [[INCDEC_PTR38_1]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_1]], i32 1 +; CHECK-NEXT: [[INC_2:%.*]] = add nuw i32 [[INC_1]], 1 +; CHECK-NEXT: [[ARRAYIDX30_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP60:%.*]] = load i16, i16* [[ARRAYIDX30_3]], align 2 +; CHECK-NEXT: [[CONV31_3:%.*]] = sext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP61:%.*]] = load i16, i16* [[ARRAYIDX32_3]], align 2 +; CHECK-NEXT: [[CONV33_3:%.*]] = sext i16 [[TMP61]] to i32 +; CHECK-NEXT: [[MUL34_3:%.*]] = mul nsw i32 [[CONV33_3]], [[CONV31_3]] +; CHECK-NEXT: [[TMP62:%.*]] = load i32, i32* [[INCDEC_PTR38_2]], align 4 +; CHECK-NEXT: [[ADD35_3:%.*]] = add nsw i32 [[MUL34_3]], [[TMP62]] +; CHECK-NEXT: store i32 [[ADD35_3]], i32* [[INCDEC_PTR38_2]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_2]], i32 1 +; CHECK-NEXT: [[INC_3]] = add nuw i32 [[INC_2]], 1 +; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[ADD25]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END40_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY29]] +; CHECK: for.end40.loopexit.unr-lcssa: +; CHECK-NEXT: br label [[FOR_END40_LOOPEXIT]] +; CHECK: for.end40.loopexit: +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: br label [[FOR_END40]] +; CHECK: for.end40: +; CHECK-NEXT: [[PSRCB_ADDR_2_LCSSA]] = phi i16* [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP93]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_2_LCSSA]] = phi i16* [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_2_LCSSA]] = phi i32* [ [[PDEST_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP94]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[INC42]] = add nuw i32 [[I_092]], 1 +; CHECK-NEXT: 
[[EXITCOND95:%.*]] = icmp eq i32 [[INC42]], [[BLKCNT]] +; CHECK-NEXT: br i1 [[EXITCOND95]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.body3.epil.1: +; CHECK-NEXT: [[TMP63:%.*]] = load i16, i16* [[ADD_PTR_EPIL]], align 2 +; CHECK-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[TMP64:%.*]] = load i16, i16* [[ADD_PTR23_EPIL]], align 2 +; CHECK-NEXT: [[CONV5_EPIL_1:%.*]] = sext i16 [[TMP64]] to i32 +; CHECK-NEXT: [[MUL_EPIL_1:%.*]] = mul nsw i32 [[CONV5_EPIL_1]], [[CONV_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV7_EPIL_1:%.*]] = sext i16 [[TMP65]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 1 +; CHECK-NEXT: [[TMP66:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV9_EPIL_1:%.*]] = sext i16 [[TMP66]] to i32 +; CHECK-NEXT: [[MUL10_EPIL_1:%.*]] = mul nsw i32 [[CONV9_EPIL_1]], [[CONV7_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 2 +; CHECK-NEXT: [[TMP67:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV12_EPIL_1:%.*]] = sext i16 [[TMP67]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 3 +; CHECK-NEXT: [[TMP68:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV14_EPIL_1:%.*]] = sext i16 [[TMP68]] to i32 +; CHECK-NEXT: [[MUL15_EPIL_1:%.*]] = mul nsw i32 [[CONV14_EPIL_1]], [[CONV12_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV18_EPIL_1:%.*]] = sext i16 [[TMP69]] to i32 +; CHECK-NEXT: [[ADD21_EPIL_1:%.*]] = add i32 [[MUL10_EPIL_1]], [[MUL_EPIL_1]] +; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add i32 [[ADD21_EPIL_1]], [[CONV14_EPIL_1]] +; CHECK-NEXT: [[ADD16_EPIL_1:%.*]] = add i32 [[ADD_EPIL_1]], [[MUL15_EPIL_1]] +; CHECK-NEXT: [[ADD22_EPIL_1:%.*]] = add i32 [[ADD16_EPIL_1]], [[CONV18_EPIL_1]] +; CHECK-NEXT: store i32 [[ADD22_EPIL_1]], i32* [[INCDEC_PTR_EPIL]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL_1:%.*]] = add nuw nsw i32 [[ADD24_EPIL]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0 +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_BODY3_EPIL_2]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: for.body3.epil.2: +; CHECK-NEXT: [[TMP70:%.*]] = load i16, i16* [[ADD_PTR_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP70]] to i32 +; CHECK-NEXT: [[TMP71:%.*]] = load i16, i16* [[ADD_PTR23_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV5_EPIL_2:%.*]] = sext i16 [[TMP71]] to i32 +; CHECK-NEXT: [[MUL_EPIL_2:%.*]] = mul nsw i32 [[CONV5_EPIL_2]], [[CONV_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 1 +; CHECK-NEXT: [[TMP72:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV7_EPIL_2:%.*]] = sext i16 [[TMP72]] to 
i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 1 +; CHECK-NEXT: [[TMP73:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV9_EPIL_2:%.*]] = sext i16 [[TMP73]] to i32 +; CHECK-NEXT: [[MUL10_EPIL_2:%.*]] = mul nsw i32 [[CONV9_EPIL_2]], [[CONV7_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 2 +; CHECK-NEXT: [[TMP74:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV12_EPIL_2:%.*]] = sext i16 [[TMP74]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV14_EPIL_2:%.*]] = sext i16 [[TMP75]] to i32 +; CHECK-NEXT: [[MUL15_EPIL_2:%.*]] = mul nsw i32 [[CONV14_EPIL_2]], [[CONV12_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 3 +; CHECK-NEXT: [[TMP76:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV18_EPIL_2:%.*]] = sext i16 [[TMP76]] to i32 +; CHECK-NEXT: [[ADD21_EPIL_2:%.*]] = add i32 [[MUL10_EPIL_2]], [[MUL_EPIL_2]] +; CHECK-NEXT: [[ADD_EPIL_2:%.*]] = add i32 [[ADD21_EPIL_2]], [[CONV14_EPIL_2]] +; CHECK-NEXT: [[ADD16_EPIL_2:%.*]] = add i32 [[ADD_EPIL_2]], [[MUL15_EPIL_2]] +; CHECK-NEXT: [[ADD22_EPIL_2:%.*]] = add i32 [[ADD16_EPIL_2]], [[CONV18_EPIL_2]] +; CHECK-NEXT: store i32 [[ADD22_EPIL_2]], i32* [[INCDEC_PTR_EPIL_1]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL_1]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL_2:%.*]] = add nuw nsw i32 [[ADD24_EPIL_1]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1 +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: for.body29.prol.1: +; CHECK-NEXT: [[ARRAYIDX30_PROL_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 [[INC_PROL]] +; CHECK-NEXT: [[TMP77:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_1]], align 2 +; CHECK-NEXT: [[CONV31_PROL_1:%.*]] = sext i16 [[TMP77]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL]], i32 [[INC_PROL]] +; CHECK-NEXT: [[TMP78:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_1]], align 2 +; CHECK-NEXT: [[CONV33_PROL_1:%.*]] = sext i16 [[TMP78]] to i32 +; CHECK-NEXT: [[MUL34_PROL_1:%.*]] = mul nsw i32 [[CONV33_PROL_1]], [[CONV31_PROL_1]] +; CHECK-NEXT: [[TMP79:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL]], align 4 +; CHECK-NEXT: [[ADD35_PROL_1:%.*]] = add nsw i32 [[MUL34_PROL_1]], [[TMP79]] +; CHECK-NEXT: store i32 [[ADD35_PROL_1]], i32* [[INCDEC_PTR38_PROL]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL]], i32 1 +; CHECK-NEXT: [[INC_PROL_1]] = add nuw i32 [[INC_PROL]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB_1:%.*]] = sub i32 [[PROL_ITER_SUB]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i32 [[PROL_ITER_SUB_1]], 0 +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_1]], label [[FOR_BODY29_PROL_2]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] +; 
CHECK: for.body29.prol.2: +; CHECK-NEXT: [[ARRAYIDX30_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 [[INC_PROL_1]] +; CHECK-NEXT: [[TMP80:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_2]], align 2 +; CHECK-NEXT: [[CONV31_PROL_2:%.*]] = sext i16 [[TMP80]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 [[INC_PROL_1]] +; CHECK-NEXT: [[TMP81:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_2]], align 2 +; CHECK-NEXT: [[CONV33_PROL_2:%.*]] = sext i16 [[TMP81]] to i32 +; CHECK-NEXT: [[MUL34_PROL_2:%.*]] = mul nsw i32 [[CONV33_PROL_2]], [[CONV31_PROL_2]] +; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL_1]], align 4 +; CHECK-NEXT: [[ADD35_PROL_2:%.*]] = add nsw i32 [[MUL34_PROL_2]], [[TMP82]] +; CHECK-NEXT: store i32 [[ADD35_PROL_2]], i32* [[INCDEC_PTR38_PROL_1]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL_1]], i32 1 +; CHECK-NEXT: [[INC_PROL_2]] = add nuw i32 [[INC_PROL_1]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB_2:%.*]] = sub i32 [[PROL_ITER_SUB_1]], 1 +; CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] +; +entry: + %cmp88 = icmp eq i32 %blkCnt, 0 + br i1 %cmp88, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.end40, %entry + ret void + +for.body: ; preds = %for.end40, %entry + %i.092 = phi i32 [ %inc42, %for.end40 ], [ 0, %entry ] + %pDest.addr.091 = phi i32* [ %pDest.addr.2.lcssa, %for.end40 ], [ %pDest, %entry ] + %pSrcA.addr.090 = phi i16* [ %pSrcA.addr.2.lcssa, %for.end40 ], [ %pSrcA, %entry ] + %pSrcB.addr.089 = phi i16* [ %pSrcB.addr.2.lcssa, %for.end40 ], [ %pSrcB, %entry ] + %0 = lshr i32 %i.092, 2 + %1 = add nuw nsw i32 %0, 3 + %2 = and i32 %1, 2147483644 + %cmp272 = icmp eq i32 %0, 0 + br i1 %cmp272, label %for.end, label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %j.076 = phi i32 [ %add24, %for.body3 ], [ 0, %for.body ] + %pDest.addr.175 = phi i32* [ %incdec.ptr, %for.body3 ], [ %pDest.addr.091, %for.body ] + %pSrcA.addr.174 = phi i16* [ %add.ptr, %for.body3 ], [ %pSrcA.addr.090, %for.body ] + %pSrcB.addr.173 = phi i16* [ %add.ptr23, %for.body3 ], [ %pSrcB.addr.089, %for.body ] + %3 = load i16, i16* %pSrcA.addr.174, align 2 + %conv = sext i16 %3 to i32 + %4 = load i16, i16* %pSrcB.addr.173, align 2 + %conv5 = sext i16 %4 to i32 + %mul = mul nsw i32 %conv5, %conv + %arrayidx6 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 1 + %5 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %5 to i32 + %arrayidx8 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 1 + %6 = load i16, i16* %arrayidx8, align 2 + %conv9 = sext i16 %6 to i32 + %mul10 = mul nsw i32 %conv9, %conv7 + %arrayidx11 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 2 + %7 = load i16, i16* %arrayidx11, align 2 + %conv12 = sext i16 %7 to i32 + %arrayidx13 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 3 + %8 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %8 to i32 + %mul15 = mul nsw i32 %conv14, %conv12 + %arrayidx17 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 3 + %9 = load i16, i16* %arrayidx17, align 2 + %conv18 = sext i16 %9 to i32 + %add21 = add i32 %mul10, %mul + %add = add i32 %add21, %conv14 + %add16 = add i32 %add, %mul15 + %add22 = add i32 %add16, 
%conv18 + store i32 %add22, i32* %pDest.addr.175, align 4 + %add.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 4 + %add.ptr23 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 4 + %incdec.ptr = getelementptr inbounds i32, i32* %pDest.addr.175, i32 1 + %add24 = add nuw nsw i32 %j.076, 4 + %cmp2 = icmp ult i32 %add24, %0 + br i1 %cmp2, label %for.body3, label %for.end + +for.end: ; preds = %for.body3, %for.body + %pSrcB.addr.1.lcssa = phi i16* [ %pSrcB.addr.089, %for.body ], [ %add.ptr23, %for.body3 ] + %pSrcA.addr.1.lcssa = phi i16* [ %pSrcA.addr.090, %for.body ], [ %add.ptr, %for.body3 ] + %pDest.addr.1.lcssa = phi i32* [ %pDest.addr.091, %for.body ], [ %incdec.ptr, %for.body3 ] + %j.0.lcssa = phi i32 [ 0, %for.body ], [ %2, %for.body3 ] + %rem = and i32 %0, 3 + %add25 = or i32 %j.0.lcssa, %rem + %cmp2780 = icmp ugt i32 %add25, %j.0.lcssa + br i1 %cmp2780, label %for.body29.preheader, label %for.end40 + +for.body29.preheader: ; preds = %for.end + %10 = sub nsw i32 %add25, %j.0.lcssa + %scevgep93 = getelementptr i16, i16* %pSrcB.addr.1.lcssa, i32 %10 + br label %for.body29 + +for.body29: ; preds = %for.body29, %for.body29.preheader + %j.184 = phi i32 [ %inc, %for.body29 ], [ %j.0.lcssa, %for.body29.preheader ] + %pDest.addr.283 = phi i32* [ %incdec.ptr38, %for.body29 ], [ %pDest.addr.1.lcssa, %for.body29.preheader ] + %pSrcA.addr.282 = phi i16* [ %incdec.ptr36, %for.body29 ], [ %pSrcA.addr.1.lcssa, %for.body29.preheader ] + %pSrcB.addr.281 = phi i16* [ %incdec.ptr37, %for.body29 ], [ %pSrcB.addr.1.lcssa, %for.body29.preheader ] + %arrayidx30 = getelementptr inbounds i16, i16* %pSrcA.addr.282, i32 %j.184 + %11 = load i16, i16* %arrayidx30, align 2 + %conv31 = sext i16 %11 to i32 + %arrayidx32 = getelementptr inbounds i16, i16* %pSrcB.addr.281, i32 %j.184 + %12 = load i16, i16* %arrayidx32, align 2 + %conv33 = sext i16 %12 to i32 + %mul34 = mul nsw i32 %conv33, %conv31 + %13 = load i32, i32* %pDest.addr.283, align 4 + %add35 = add nsw i32 %mul34, %13 + store i32 %add35, i32* %pDest.addr.283, align 4 + %incdec.ptr36 = getelementptr inbounds i16, i16* %pSrcA.addr.282, i32 1 + %incdec.ptr37 = getelementptr inbounds i16, i16* %pSrcB.addr.281, i32 1 + %incdec.ptr38 = getelementptr inbounds i32, i32* %pDest.addr.283, i32 1 + %inc = add nuw i32 %j.184, 1 + %exitcond = icmp eq i32 %inc, %add25 + br i1 %exitcond, label %for.end40.loopexit, label %for.body29 + +for.end40.loopexit: ; preds = %for.body29 + %scevgep = getelementptr i16, i16* %pSrcA.addr.1.lcssa, i32 %10 + %scevgep94 = getelementptr i32, i32* %pDest.addr.1.lcssa, i32 %10 + br label %for.end40 + +for.end40: ; preds = %for.end40.loopexit, %for.end + %pSrcB.addr.2.lcssa = phi i16* [ %pSrcB.addr.1.lcssa, %for.end ], [ %scevgep93, %for.end40.loopexit ] + %pSrcA.addr.2.lcssa = phi i16* [ %pSrcA.addr.1.lcssa, %for.end ], [ %scevgep, %for.end40.loopexit ] + %pDest.addr.2.lcssa = phi i32* [ %pDest.addr.1.lcssa, %for.end ], [ %scevgep94, %for.end40.loopexit ] + %inc42 = add nuw i32 %i.092, 1 + %exitcond95 = icmp eq i32 %inc42, %blkCnt + br i1 %exitcond95, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir b/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir index fec542223fc987..e4100543d3c711 100644 --- a/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir +++ b/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir @@ -1,3 +1,5 @@ +# Call site info is not yet supported for this target, so we use the experimental options (-emit-call-site-info 
-debug-entry-values). + # RUN: llc -emit-call-site-info -debug-entry-values -run-pass=none -verify-machineinstrs -o - %s | FileCheck %s # Verify that it is possible to read and write MIR where a callSites entry diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir index 2472aa707e1694..d5bd82c710803f 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir @@ -1,5 +1,5 @@ -# RUN: not llc -mtriple=x86_64-- -run-pass none %s -o - 2>&1 | FileCheck %s -# CHECK: Call site info provided but not used +# RUN: llc -emit-call-site-info -mtriple=x86_64-- -run-pass none %s -o - 2>&1 | FileCheck %s +# CHECK-NOT: Call site info provided but not used --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index 0f90da66d60d9c..a37a9f0b6a5ce4 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -1,16 +1,18 @@ -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,32BIT %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM32PWR4 %s +; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,64BIT %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM64PWR4 %s +; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s %struct.S1 = type { [1 x i8] } @gS1 = external global %struct.S1, align 1 @@ -34,13 +36,13 @@ declare void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1) ; CHECKASM-LABEL: .call_test_byval_1Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lbz 3, 0([[REG]]) -; ASM32PWR4-NEXT: slwi 3, 3, 24 -; ASM32PWR4-NEXT: bl .test_byval_1Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-NEXT: lbz 3, 0([[REG]]) +; ASM32-NEXT: slwi 3, 3, 24 +; ASM32-NEXT: bl .test_byval_1Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 64 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 ; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS1, $x2 :: (load 8 from got) @@ -49,14 +51,14 @@ declare void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1) ; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: std 0, 16(1) -; ASM64PWR4-NEXT: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lbz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 56 -; ASM64PWR4-NEXT: bl .test_byval_1Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: std 0, 16(1) +; ASM64-NEXT: stdu 1, -112(1) +; 
ASM64-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-NEXT: lbz 3, 0([[REG]]) +; ASM64-NEXT: sldi 3, 3, 56 +; ASM64-NEXT: bl .test_byval_1Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 112 %struct.S2 = type { [2 x i8] } @@ -81,13 +83,13 @@ declare void @test_byval_2Byte(%struct.S2* byval(%struct.S2) align 1) ; CHECKASM-LABEL: .call_test_byval_2Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lhz 3, 0([[REG]]) -; ASM32PWR4-NEXT: slwi 3, 3, 16 -; ASM32PWR4-NEXT: bl .test_byval_2Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-NEXT: lhz 3, 0([[REG]]) +; ASM32-NEXT: slwi 3, 3, 16 +; ASM32-NEXT: bl .test_byval_2Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 64 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 ; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS2, $x2 :: (load 8 from got) @@ -96,14 +98,14 @@ declare void @test_byval_2Byte(%struct.S2* byval(%struct.S2) align 1) ; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: std 0, 16(1) -; ASM64PWR4-NEXT: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lhz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 48 -; ASM64PWR4-NEXT: bl .test_byval_2Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: std 0, 16(1) +; ASM64-NEXT: stdu 1, -112(1) +; ASM64-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-NEXT: lhz 3, 0([[REG]]) +; ASM64-NEXT: sldi 3, 3, 48 +; ASM64-NEXT: bl .test_byval_2Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 112 %struct.S3 = type { [3 x i8] } @@ -132,14 +134,14 @@ declare void @test_byval_3Byte(%struct.S3* byval(%struct.S3) align 1) ; CHECKASM-LABEL: .call_test_byval_3Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM32PWR4-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM32PWR4-DAG: rlwinm 3, [[REG2]], 8, 16, 23 -; ASM32PWR4-DAG: rlwimi 3, [[REG1]], 16, 0, 15 -; ASM32PWR4-NEXT: bl .test_byval_3Byte -; ASM32PWR4-NEXT: nop +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM32-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) +; ASM32-DAG: rlwinm 3, [[REG2]], 8, 16, 23 +; ASM32-DAG: rlwimi 3, [[REG1]], 16, 0, 15 +; ASM32-NEXT: bl .test_byval_3Byte +; ASM32-NEXT: nop ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 @@ -152,14 +154,14 @@ declare void @test_byval_3Byte(%struct.S3* byval(%struct.S3) align 1) ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM64PWR4-DAG: rldic 3, [[REG2]], 40, 16 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 48, 0 -; ASM64PWR4-NEXT: bl .test_byval_3Byte -; ASM64PWR4-NEXT: nop +; ASM64: stdu 1, -112(1) +; ASM64-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM64-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) +; ASM64-DAG: rldic 3, [[REG2]], 40, 16 +; ASM64-DAG: rldimi 3, [[REG1]], 48, 0 +; ASM64-NEXT: bl .test_byval_3Byte +; ASM64-NEXT: nop %struct.S4 = type { [4 x i8] } @@ -183,12 +185,12 @@ declare void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1) ; CHECKASM-LABEL: .call_test_byval_4Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lwz 3, 0([[REG]]) -; ASM32PWR4-NEXT: bl .test_byval_4Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-NEXT: lwz 3, 0([[REG]]) +; ASM32-NEXT: bl .test_byval_4Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 64 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 ; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS4, $x2 :: (load 8 from got) @@ -197,10 +199,10 @@ declare void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1) ; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lwz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 32 -; ASM64PWR4-NEXT: bl .test_byval_4Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: stdu 1, -112(1) +; ASM64-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-NEXT: lwz 3, 0([[REG]]) +; ASM64-NEXT: sldi 3, 3, 32 +; ASM64-NEXT: bl .test_byval_4Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 112 diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll index 599ab13530b840..e8135af274b545 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll @@ -1,9 +1,10 @@ -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ -; RUN: FileCheck --check-prefixes=CHECK,64BIT %s +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ +; RUN: FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM64PWR4 %s +; RUN: FileCheck --check-prefix=ASM %s %struct.S5 = type { [5 x i8] } @@ -19,27 +20,27 @@ declare void @test_byval_5Byte(%struct.S5* byval(%struct.S5) align 1) ; CHECK-LABEL: name: call_test_byval_5Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_5Byte: +; ASM-LABEL: .call_test_byval_5Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load 1) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load 1) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG2]], 24, 0, 7 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_5Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lbz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG2]], 24, 0, 7 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_5Byte +; ASM-NEXT: nop %struct.S6 = type { [6 x i8] } @@ -55,27 +56,27 @@ declare void @test_byval_6Byte(%struct.S6* byval(%struct.S6) align 1) ; CHECK-LABEL: name: call_test_byval_6Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_6Byte: +; ASM-LABEL: .call_test_byval_6Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG2]], 16, 0, 15 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_6Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG2]], 16, 0, 15 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_6Byte +; ASM-NEXT: nop %struct.S7 = type { [7 x i8] } @@ -91,31 +92,31 @@ declare void @test_byval_7Byte(%struct.S7* byval(%struct.S7) align 1) ; CHECK-LABEL: name: call_test_byval_7Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_7Byte: +; ASM-LABEL: .call_test_byval_7Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) -; 64BIT-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load 1) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 -; 64BIT-DAG: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) +; CHECK-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load 1) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 +; CHECK-DAG: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG3:[0-9]+]], 6([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG3]], 8, 16, 23 -; ASM64PWR4-DAG: rlwimi 3, [[REG2]], 16, 0, 15 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_7Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: lbz [[REG3:[0-9]+]], 6([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG3]], 8, 16, 23 +; ASM-DAG: rlwimi 3, [[REG2]], 16, 0, 15 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_7Byte +; ASM-NEXT: nop %struct.S8 = type { [8 x i8] } @@ -131,16 +132,16 @@ declare void @test_byval_8Byte(%struct.S8* byval(%struct.S8) align 1) ; CHECK-LABEL: name: call_test_byval_8Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_8Byte: +; ASM-LABEL: .call_test_byval_8Byte: -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load 8 from got) -; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load 8) -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load 8 from got) +; CHECK-NEXT: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load 8) +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: ld 3, 0([[REGADDR]]) -; ASM64PWR4-NEXT: bl .test_byval_8Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-NEXT: ld 3, 0([[REGADDR]]) +; ASM-NEXT: bl .test_byval_8Byte +; ASM-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir index df5b040b5dcd14..c922312eae12d8 100644 --- a/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir +++ b/llvm/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir @@ -550,6 +550,14 @@ ret i64 %and } + ; Function Attrs: norecurse nounwind readnone + define i64 @testRLDICL_MB0(i64 %a) local_unnamed_addr #0 { + entry: + %shr = lshr i64 %a, 11 + %and = and i64 %shr, 16777215 + ret i64 %and + } + ; Function Attrs: norecurse nounwind readnone define i64 @testRLDICL_rec(i64 %a, i64 %b) local_unnamed_addr #0 { entry: @@ -3882,6 +3890,52 @@ body: | $x3 = COPY %1 BLR8 implicit $lr8, implicit $rm, implicit $x3 +... 
+--- +name: testRLDICL_MB0 +# CHECK-ALL: name: testRLDICL_MB0 +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc, preferred-register: '' } + - { id: 1, class: g8rc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + liveins: $x3 + + %0 = LI8 32 + %1 = RLDICL %0, 60, 0 + ; CHECK: LI8 2 + ; CHECK-LATE: li 3, 2 + $x3 = COPY %1 + BLR8 implicit $lr8, implicit $rm, implicit $x3 + ... --- name: testRLDICL_rec diff --git a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll index b7bb622a1f9078..672a8f3e82aa7f 100644 --- a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll +++ b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll @@ -1,4 +1,4 @@ -; NOTE: This test ensures that for both Big and Little Endian cases a set of +; NOTE: This test ensures that, for both Big and Little Endian cases, a set of ; NOTE: 4 floats is gathered into a v4f32 register using xxmrghw and xxmrgld ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmlad.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmlad.ll new file mode 100644 index 00000000000000..0396f96457f43f --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vqdmlad.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmladhq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqdmladhq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladh.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 0, i32 0) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmladhq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqdmladhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladh.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 0, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmladhq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqdmladhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladh.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 0, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmladhxq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqdmladhxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladhx.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 
x i8> %a, <16 x i8> %b, i32 1, i32 0, i32 0) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmladhxq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqdmladhxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladhx.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 0, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmladhxq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqdmladhxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmladhx.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 0, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlsdhq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqdmlsdhq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdh.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 0, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlsdhq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqdmlsdhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdh.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsdhq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqdmlsdhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdh.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlsdhxq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqdmlsdhxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdhx.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 0, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlsdhxq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqdmlsdhxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdhx.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsdhxq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqdmlsdhxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlsdhx.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmladhq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqrdmladhq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladh.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 1, i32 0) + ret <16 x i8> %0 +} + 
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmladhq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqrdmladhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladh.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 1, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmladhq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqrdmladhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladh.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 1, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmladhxq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqrdmladhxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladhx.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 1, i32 0) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmladhxq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqrdmladhxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladhx.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 1, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmladhxq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqrdmladhxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmladhx.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 1, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlsdhq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqrdmlsdhq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdh.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 1, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlsdhq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqrdmlsdhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdh.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 1, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlsdhq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqrdmlsdhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdh.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 1, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlsdhxq_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_vqrdmlsdhxq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdhx.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 1, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> 
@test_vqrdmlsdhxq_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_vqrdmlsdhxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdhx.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 1, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlsdhxq_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vqrdmlsdhxq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqrdmlsdhx.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 1, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmladhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 0, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmladhq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmladhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmladhxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhxq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladhxt.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 0, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmladhxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhxq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladhxt.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, 
i32 1, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmladhxq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmladhxq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmladhxt.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlsdhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 0, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlsdhq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsdhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlsdhxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhxq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdhxt.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 0, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlsdhxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhxq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdhxt.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsdhxq_m_s32(<4 
x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlsdhxq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlsdhxt.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmladhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmladhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 1, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmladhq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmladhq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 1, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmladhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmladhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 1, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmladhxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmladhxq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladhxt.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 1, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmladhxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmladhxq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladhxt.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 1, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmladhxq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: 
test_vqrdmladhxq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmladhxt.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 1, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlsdhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlsdht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 0, i32 1, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlsdhq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlsdht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 0, i32 1, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlsdhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlsdht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, i32 1, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlsdhxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhxq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlsdhxt.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i32 1, i32 1, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlsdhxq_m_s16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhxq_m_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqrdmlsdhxt.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i32 1, i32 1, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlsdhxq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vqrdmlsdhxq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst 
+; CHECK-NEXT: vqrdmlsdhxt.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 1, i32 1, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare <16 x i8> @llvm.arm.mve.vqdmlad.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, i32, i32, i32) +declare <8 x i16> @llvm.arm.mve.vqdmlad.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i32, i32, i32) +declare <4 x i32> @llvm.arm.mve.vqdmlad.v4i32(<4 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32) +declare <16 x i8> @llvm.arm.mve.vqdmlad.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i8>, i32, i32, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vqdmlad.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i16>, i32, i32, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i32>, i32, i32, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/X86/call-site-info-output.ll b/llvm/test/CodeGen/X86/call-site-info-output.ll index a0438f0c2b9859..0686f184b5262b 100644 --- a/llvm/test/CodeGen/X86/call-site-info-output.ll +++ b/llvm/test/CodeGen/X86/call-site-info-output.ll @@ -1,6 +1,6 @@ ; Test call site info MIR printer and parser.Parser assertions and machine ; verifier will check the rest; -; RUN: llc -emit-call-site-info -debug-entry-values %s -stop-before=finalize-isel -o %t.mir +; RUN: llc -emit-call-site-info %s -stop-before=finalize-isel -o %t.mir ; RUN: cat %t.mir | FileCheck %s ; CHECK: name: fn2 ; CHECK: callSites: @@ -10,7 +10,7 @@ ; CHECK-NEXT: arg: 0, reg: '$edi' ; CHECK-NEXT: arg: 1, reg: '$esi' ; CHECK-NEXT: arg: 2, reg: '$edx' -; RUN: llc -emit-call-site-info -debug-entry-values %t.mir -run-pass=finalize-isel -o -| FileCheck %s --check-prefix=PARSER +; RUN: llc -emit-call-site-info %t.mir -run-pass=finalize-isel -o -| FileCheck %s --check-prefix=PARSER ; Verify that we are able to parse output mir and that we are getting the same result. 
; PARSER: name: fn2 ; PARSER: callSites: diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll index ff483ab1760151..4f2859d4bffab0 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-flags.ll @@ -108,7 +108,7 @@ entry: ; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) ; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1, align 16) ; CHECK: [[MOVSDrm_alt:%[0-9]+]]:fr64 = MOVSDrm_alt %fixed-stack.3, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.3, align 16) -; CHECK: %3:fr64 = nofpexcept DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load 8 from %fixed-stack.2) +; CHECK: %3:fr64 = DIVSDrm [[MOVSDrm_alt]], %fixed-stack.2, 1, $noreg, 0, $noreg, implicit $mxcsr :: (load 8 from %fixed-stack.2) ; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %3 :: (store 8 into %ir.x, align 4) ; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %3 :: (store 8 into %ir.y, align 4) ; CHECK: RET 0 @@ -126,7 +126,7 @@ entry: ; CHECK-LABEL: name: sitofp_cse ; CHECK: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0, align 8) ; CHECK: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1) -; CHECK: %2:fr64 = nofpexcept CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.2, align 16) +; CHECK: %2:fr64 = CVTSI2SDrm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.2, align 16) ; CHECK: MOVSDmr killed [[MOV32rm1]], 1, $noreg, 0, $noreg, %2 :: (store 8 into %ir.x, align 4) ; CHECK: MOVSDmr killed [[MOV32rm]], 1, $noreg, 0, $noreg, %2 :: (store 8 into %ir.y, align 4) ; CHECK: RET 0 diff --git a/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll b/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll index a925cd0c1452d3..3b91d17dc62840 100644 --- a/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll +++ b/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s ; Based on the following C reproducer: ; diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir index ed2a7d903885bb..e3ee2cac4fa98a 100644 --- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir +++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir index bd42c229d29c9e..8f7f789eb8fb77 100644 --- a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir +++ b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -mtriple thumbv7em-apple-unknown-macho 
-start-after=machineverifier %s -o %t.o +# RUN: llc -O1 -emit-call-site-info -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o # RUN: llvm-dwarfdump %t.o | FileCheck %s # Crash test, reduced from: diff --git a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir index 2cf7e4d1c87fc1..e3f1031796a049 100644 --- a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir +++ b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o +# RUN: llc -O1 -emit-call-site-info -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o # RUN: llvm-dwarfdump %t.o | FileCheck %s # Crash test, reduced from: diff --git a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir index 5b84d9e9627f1c..b25b1c90d6348c 100644 --- a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir +++ b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -o - %s | FileCheck %s # Based on the following C reproducer: # @@ -106,10 +106,6 @@ name: caller alignment: 4 tracksRegLiveness: true -callSites: - - { bb: 0, offset: 6 } - - { bb: 0, offset: 9, fwdArgRegs: - - { arg: 0, reg: '$r0' } } body: | bb.0: liveins: $lr diff --git a/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir index 3ae23d4189bf18..9baa815a0458bb 100644 --- a/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir +++ b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir @@ -1,4 +1,6 @@ -# RUN: llc -mtriple hexagon -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s +# Call site info is not yet supported for this target, so we use the experimental options (-emit-call-site-info -debug-entry-values). + +# RUN: llc -emit-call-site-info -debug-entry-values -mtriple hexagon -start-after=machineverifier -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir b/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir index 8bb0b3202acd3d..2ed3672c2ec302 100644 --- a/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir +++ b/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir @@ -1,3 +1,5 @@ +# Call site info is not yet supported for this target, so we use the experimental options (-emit-call-site-info -debug-entry-values).
+ # RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -o - %s | FileCheck %s # Verify that the entry values for the input parameters are inserted after the diff --git a/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir b/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir index 3cf41467f7f9f7..f173c9d780fa9a 100644 --- a/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir +++ b/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir @@ -1,3 +1,5 @@ +# Call site info is not yet supported for this target, so we use the experimental options (-emit-call-site-info -debug-entry-values). + # RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -o - %s | FileCheck %s # This test would previously trigger an assertion when trying to describe the diff --git a/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir b/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir index 4e5a07321d4280..f7f74b628d1660 100644 --- a/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir +++ b/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -mtriple=x86_64-apple-darwin -o %t %s -filetype=obj +# RUN: llc -start-before=livedebugvalues -mtriple=x86_64-apple-darwin -o %t %s -filetype=obj # RUN: llvm-dwarfdump %t | FileCheck %s # # int global; diff --git a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir index 891fbb60b36e6a..63d03050918f43 100644 --- a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir +++ b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir @@ -1,17 +1,21 @@ # Test the call site encoding in DWARF5 vs GNU extensions. # -# RUN: llc -dwarf-version 4 -debugger-tune=gdb -emit-call-site-info -debug-entry-values -filetype=obj \ +# RUN: llc -emit-call-site-info -dwarf-version 4 -debugger-tune=gdb -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU # -# RUN: llc -dwarf-version 5 -debugger-tune=lldb -emit-call-site-info -debug-entry-values -filetype=obj \ +# RUN: llc -emit-call-site-info -dwarf-version 5 -debugger-tune=lldb -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 # -# RUN: llc -dwarf-version 5 -emit-call-site-info -debug-entry-values -filetype=obj \ +# RUN: llc -emit-call-site-info -dwarf-version 5 -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 # +# RUN: llc -emit-call-site-info -dwarf-version 5 -filetype=obj -debugger-tune=sce \ +# RUN: -emit-debug-entry-values -debug-entry-values -mtriple=x86_64-unknown-unknown \ +# RUN: -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 +# # This is based on the following reproducer: # # extern void fn(); diff --git a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir index 6d69f063020295..5b9ecf08150be7 100644 --- a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir +++ b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir @@ -1,5 +1,5 @@ # RUN: llc -start-after=livedebugvalues -mtriple=x86_64-apple-darwin -o - %s -filetype=obj \ -# RUN: -emit-call-site-info -debug-entry-values | llvm-dwarfdump
- | FileCheck %s -implicit-check-not=call_site_parameter +# RUN: -emit-call-site-info | llvm-dwarfdump - | FileCheck %s -implicit-check-not=call_site_parameter # CHECK: DW_TAG_formal_parameter # CHECK-NEXT: DW_AT_location diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir index 01a2b887a60b63..347a0ec09bb24d 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # Based on the following reproducer: # diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir index d6c6b30184622f..b142313871eb1e 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # # CHECK: DW_TAG_GNU_call_site # CHECK-NEXT: DW_AT_abstract_origin {{.*}} "foo" diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir index 4d88fa9aab74d9..79e40b65c42086 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # CHECK: DW_TAG_GNU_call_site # CHECK-NEXT: DW_AT_abstract_origin {{.*}} "foo") # CHECK-NEXT: DW_AT_low_pc {{.*}} diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir index b97785d650c943..f0902bbe41d660 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s \ +# RUN: llc -emit-call-site-info -start-before=livedebugvalues -filetype=obj -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter --- | diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir index 81af598ba19424..73927772ca085f 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# RUN: llc -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s # Based on the following C++ code: # struct A { A(A &) {} }; diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir 
b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir index 1baf66393c49d6..27a03193e81615 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s \ +# RUN: llc -emit-call-site-info -start-before=livedebugvalues -filetype=obj -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter --- | diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir index ac97da66a397cd..c1bdbd0783acd9 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s +# RUN: llc -O1 -emit-call-site-info -start-after=livedebugvalues -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir b/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir index 9346b513cf481b..302cce20a15ac1 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -verify-machineinstrs -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -verify-machineinstrs -march=x86-64 -o - %s | FileCheck %s # #extern void fn2(int); # diff --git a/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir b/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir index 2a78919afd4385..c5ca4f1b2a1477 100644 --- a/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir +++ b/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir @@ -2,8 +2,8 @@ # When the debugger tuning is set to gdb, use GNU opcodes. # For lldb, use the standard DWARF5 opcodes. 
-# RUN: llc -emit-call-site-info -debug-entry-values -debugger-tune=gdb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU -# RUN: llc -emit-call-site-info -debug-entry-values -debugger-tune=lldb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 +# RUN: llc -emit-call-site-info -debugger-tune=gdb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU +# RUN: llc -emit-call-site-info -debugger-tune=lldb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 # # extern void foo(int *a, int b, int c, int d, int e, int f); # extern int getVal(); diff --git a/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir index 541a2155578ec6..0dd63ae98009ca 100644 --- a/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir +++ b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # #extern void fn1 (int, int, int); # @@ -85,11 +85,6 @@ --- name: fn2 alignment: 16 -callSites: - - { bb: 0, offset: 14, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } body: | bb.0.entry: liveins: $edi, $esi, $rbx diff --git a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir index 042d76058a2282..fc7bd93d0223c1 100644 --- a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir +++ b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # # The test case was artificially adjusted, in order to make proper diamond basic # block structure relevant to the debug entry values propagation. 
diff --git a/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir index c5af863954bfb3..34f80f5ca2a321 100644 --- a/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir +++ b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # #extern void fn1 (int, int, int); #__attribute__((noinline)) @@ -110,15 +110,6 @@ --- name: fn2 alignment: 16 -callSites: - - { bb: 0, offset: 20, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } - - { bb: 3, offset: 2, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } body: | bb.0.entry: successors: %bb.1(0x40000000), %bb.2(0x40000000) diff --git a/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir b/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir index ea9c12b5a192a8..bfc5c2be127e7f 100644 --- a/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir +++ b/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=x86_64-pc-linux -emit-call-site-info -debug-entry-values -run-pass=unreachable-mbb-elimination -o - %s | FileCheck %s +# RUN: llc -emit-call-site-info -mtriple=x86_64-pc-linux -run-pass=unreachable-mbb-elimination -o - %s | FileCheck %s # Verify that the call site information for the call residing in the eliminated # block is removed. This test case would previously trigger an assertion when diff --git a/llvm/test/DebugInfo/X86/arange.ll b/llvm/test/DebugInfo/X86/arange.ll index f9facc795f541c..49090bfc610756 100644 --- a/llvm/test/DebugInfo/X86/arange.ll +++ b/llvm/test/DebugInfo/X86/arange.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj -generate-arange-section < %s | llvm-dwarfdump -debug-aranges - | FileCheck %s ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj -generate-arange-section < %s | llvm-readobj --relocations - | FileCheck --check-prefix=OBJ %s diff --git a/llvm/test/DebugInfo/X86/arguments.ll b/llvm/test/DebugInfo/X86/arguments.ll index 05b2981439bc97..8f030867c3d988 100644 --- a/llvm/test/DebugInfo/X86/arguments.ll +++ b/llvm/test/DebugInfo/X86/arguments.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-unknown -O0 -filetype=obj < %s > %t ; RUN: llvm-dwarfdump %t | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/c-type-units.ll b/llvm/test/DebugInfo/X86/c-type-units.ll index 844823b2cda892..889bd79d27dec1 100644 --- a/llvm/test/DebugInfo/X86/c-type-units.ll +++ b/llvm/test/DebugInfo/X86/c-type-units.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -o - %s -filetype=obj -O0 -debugger-tune=lldb -generate-type-units -mtriple=x86_64-unknown-linux-gnu | llvm-dwarfdump -debug-types - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll index e0cfe5f15ee959..9159d2aac780c2 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-range.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll @@ -56,6 +56,6 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ;CHECK-NEXT: .quad [[CLOBBER_OFF]] ;CHECK-NEXT: .short 1 ## Loc expr size ;CHECK-NEXT: .byte 85 ## DW_OP_reg -;CHECK-NEXT: .quad 0 +;CHECK: .quad 0 ;CHECK-NEXT: .quad 0 !24 = !{i32 1, !"Debug Info 
Version", i32 3} diff --git a/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll b/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll index 440498a9d8dd45..425a6cb38c4105 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll @@ -9,8 +9,7 @@ ; ASM: movl $1, x(%rip) ; ASM: callq clobber ; ASM-NEXT: [[argc_range_end:.Ltmp[0-9]+]]: -; Previously LiveDebugValues would claim argc was still in ecx after the call. -; ASM-NOT: #DEBUG_VALUE: main:argc +; ASM: #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx ; argc is the first debug location. ; ASM: .Ldebug_loc1: @@ -23,7 +22,8 @@ ; DWARF: .debug_info contents: ; DWARF: DW_TAG_formal_parameter ; DWARF-NEXT: DW_AT_location ({{0x.*}} -; DWARF-NEXT: [0x0000000000000000, 0x0000000000000013): DW_OP_reg2 RCX) +; DWARF-NEXT: [0x0000000000000000, 0x0000000000000013): DW_OP_reg2 RCX +; DWARF-NEXT: [0x0000000000000013, 0x0000000000000043): DW_OP_GNU_entry_value(DW_OP_reg2 RCX), DW_OP_stack_value ; DWARF-NEXT: DW_AT_name ("argc") ; ModuleID = 't.cpp' diff --git a/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll b/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll index b8cd9574cc63d6..f12dfa6196c1fb 100644 --- a/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll +++ b/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll @@ -1,4 +1,4 @@ -; RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -emit-call-site-info -O1 -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s ; Verify that the 64-bit call site immediates are not truncated. ; diff --git a/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll b/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll index 5d37774f55d6ff..dc8c418117c75a 100644 --- a/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll +++ b/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll @@ -1,4 +1,4 @@ -; RUN: llc -O3 -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -emit-call-site-info -O3 -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/DebugInfo/X86/debug-loc-frame.ll b/llvm/test/DebugInfo/X86/debug-loc-frame.ll index 653ba1f3eb688a..83c678ccedf97a 100644 --- a/llvm/test/DebugInfo/X86/debug-loc-frame.ll +++ b/llvm/test/DebugInfo/X86/debug-loc-frame.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Check that when variables are allocated on the stack we generate debug locations ; for the stack location directly instead of generating a register+offset indirection. diff --git a/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll b/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll index c1f1dc99ede563..6502c86a20b8e6 100644 --- a/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll +++ b/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that no DWARF v5 names section is emitted when all CUs disable name tables. ; RUN: llc -mtriple x86_64-pc-linux -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/debug-names-partial.ll b/llvm/test/DebugInfo/X86/debug-names-partial.ll index 28ee59ea716234..50a21d6b9cdc86 100644 --- a/llvm/test/DebugInfo/X86/debug-names-partial.ll +++ b/llvm/test/DebugInfo/X86/debug-names-partial.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that DWARF v5 debug_names omit names from CUs that opt-out. 
; RUN: llc -mtriple x86_64-pc-linux -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll b/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll index 26687e8143ceac..66520395dadf0b 100644 --- a/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll +++ b/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that DWARF v5 accelerator tables work with split-dwarf. ; RUN: llc -mtriple x86_64-pc-linux -split-dwarf-file=foo.dwo \ diff --git a/llvm/test/DebugInfo/X86/decl-derived-member.ll b/llvm/test/DebugInfo/X86/decl-derived-member.ll index acb39f4e8f6cb2..2d5ca1a87f9b5a 100644 --- a/llvm/test/DebugInfo/X86/decl-derived-member.ll +++ b/llvm/test/DebugInfo/X86/decl-derived-member.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple x86_64-pc-linux -O0 -filetype=obj %s -o %t ; RUN: llvm-dwarfdump %t | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll b/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll index c019d75c0dbfda..931f77c72f7885 100644 --- a/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll +++ b/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll @@ -12,7 +12,6 @@ ; and fail with "failed to compute relocation: IMAGE_REL_AMD64_ADDR32". ; UNSUPPORTED: cygwin,windows-gnu,windows-msvc -; REQUIRES: x86 ; RUN: %llc_dwarf -mtriple=x86_64-- < %s -o - | FileCheck %s -check-prefix=ASM ; RUN: %llc_dwarf -debugger-tune=lldb -mtriple=x86_64-- < %s -filetype=obj -o %t.o ; RUN: llvm-dwarfdump %t.o -o - | FileCheck %s -check-prefix=OBJ -implicit-check-not=DW_TAG_call_site diff --git a/llvm/test/DebugInfo/X86/generate-odr-hash.ll b/llvm/test/DebugInfo/X86/generate-odr-hash.ll index 68dcfda2254b00..9fa954cd24858f 100644 --- a/llvm/test/DebugInfo/X86/generate-odr-hash.ll +++ b/llvm/test/DebugInfo/X86/generate-odr-hash.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc < %s -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu ; RUN: llvm-dwarfdump -v %t | FileCheck --check-prefix=CHECK --check-prefix=SINGLE %s diff --git a/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll b/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll index 8796851e593cf6..f80b3ac1622408 100644 --- a/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll +++ b/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-apple-macosx10.10.0 -o %t %s diff --git a/llvm/test/DebugInfo/X86/inline-member-function.ll b/llvm/test/DebugInfo/X86/inline-member-function.ll index 76f1d86777ac26..31cc5b0fa5cff1 100644 --- a/llvm/test/DebugInfo/X86/inline-member-function.ll +++ b/llvm/test/DebugInfo/X86/inline-member-function.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/lexical_block.ll b/llvm/test/DebugInfo/X86/lexical_block.ll index a08cb0346c124f..1af231e8dfe0a9 100644 --- a/llvm/test/DebugInfo/X86/lexical_block.ll +++ b/llvm/test/DebugInfo/X86/lexical_block.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s \ ; RUN: | llvm-dwarfdump -v -debug-info - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V4 %s diff --git a/llvm/test/DebugInfo/X86/loclists-dwp.ll b/llvm/test/DebugInfo/X86/loclists-dwp.ll index 91f83887633865..a972c8094c5f15 100644 --- a/llvm/test/DebugInfo/X86/loclists-dwp.ll +++ b/llvm/test/DebugInfo/X86/loclists-dwp.ll @@ -19,10 +19,12 @@ ; void b(int i) { asm("" : : : "rdi"); } 
; CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000006): DW_OP_reg5 RDI) +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000006): DW_OP_reg5 RDI +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000001, 0x0000000000000002): DW_OP_GNU_entry_value(DW_OP_reg5 RDI), DW_OP_stack_value) ; CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000000): DW_OP_reg5 RDI) +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000000): DW_OP_reg5 RDI +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000001, 0x0000000000000001): DW_OP_GNU_entry_value(DW_OP_reg5 RDI), DW_OP_stack_value) target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/DebugInfo/X86/missing-file-line.ll b/llvm/test/DebugInfo/X86/missing-file-line.ll index 24cc418c43976c..08f6f1529040ef 100644 --- a/llvm/test/DebugInfo/X86/missing-file-line.ll +++ b/llvm/test/DebugInfo/X86/missing-file-line.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux-gnu -filetype=obj %s -o - | llvm-dwarfdump -all - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll b/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll new file mode 100644 index 00000000000000..8ba22b7b6e5103 --- /dev/null +++ b/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll @@ -0,0 +1,88 @@ +; RUN: llc -O0 -dwarf-version=5 -debugger-tune=lldb -march=x86-64 -filetype=obj < %s \ +; RUN: | llvm-dwarfdump - | FileCheck --implicit-check-not=DW_OP_entry_value %s +; RUN: llc -O0 -dwarf-version=5 -debugger-tune=gdb -march=x86-64 -filetype=obj < %s \ +; RUN: | llvm-dwarfdump - | FileCheck --implicit-check-not=DW_OP_entry_value %s + +; The call-site-params are created iff the corresponding DISubprogram contains +; the AllCallsDescribed DIFlag.
+; CHECK-NOT: DW_TAG_call_site_param
+
+; Generated with:
+; clang -gdwarf-5 -O0 test.c -S -emit-llvm
+;
+; ModuleID = 'test.c'
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @fn1(i32 %x, i32 %y) !dbg !7 {
+entry:
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca i32, align 4
+ %u = alloca i32, align 4
+ %a = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !11, metadata !DIExpression()), !dbg !12
+ store i32 %y, i32* %y.addr, align 4
+ call void @llvm.dbg.declare(metadata i32* %y.addr, metadata !13, metadata !DIExpression()), !dbg !14
+ call void @llvm.dbg.declare(metadata i32* %u, metadata !15, metadata !DIExpression()), !dbg !16
+ %0 = load i32, i32* %x.addr, align 4, !dbg !16
+ %1 = load i32, i32* %y.addr, align 4, !dbg !16
+ %add = add nsw i32 %0, %1, !dbg !16
+ store i32 %add, i32* %u, align 4, !dbg !16
+ %2 = load i32, i32* %x.addr, align 4, !dbg !17
+ %cmp = icmp sgt i32 %2, 1, !dbg !17
+ br i1 %cmp, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %entry
+ %3 = load i32, i32* %u, align 4, !dbg !17
+ %add1 = add nsw i32 %3, 1, !dbg !17
+ store i32 %add1, i32* %u, align 4, !dbg !17
+ br label %if.end, !dbg !17
+
+if.else: ; preds = %entry
+ %4 = load i32, i32* %u, align 4, !dbg !17
+ %add2 = add nsw i32 %4, 2, !dbg !17
+ store i32 %add2, i32* %u, align 4, !dbg !17
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !DIExpression()), !dbg !16
+ store i32 7, i32* %a, align 4, !dbg !16
+ %5 = load i32, i32* %a, align 4, !dbg !16
+ call void @fn2(i32 %5), !dbg !16
+ %6 = load i32, i32* %u, align 4, !dbg !16
+ %dec = add nsw i32 %6, -1, !dbg !16
+ store i32 %dec, i32* %u, align 4, !dbg !16
+ ret void, !dbg !16
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+declare dso_local void @fn2(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 11.0.0"}
+!7 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 5, type: !10)
+!12 = !DILocation(line: 5, column: 10, scope: !7)
+!13 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 5, type: !10)
+!14 = !DILocation(line: 5, column: 17, scope: !7)
+!15 = !DILocalVariable(name: "u", scope: !7, file: !1, line: 6, type: !10)
+!16 = !DILocation(line: 6, column: 7, scope: !7)
+!17 = !DILocation(line: 7, column: 7, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7)
+!19 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 11, type:
!10) diff --git a/llvm/test/DebugInfo/X86/nodebug.ll b/llvm/test/DebugInfo/X86/nodebug.ll index 6062f114f80b63..b52254dea86bc5 100644 --- a/llvm/test/DebugInfo/X86/nodebug.ll +++ b/llvm/test/DebugInfo/X86/nodebug.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc < %s -filetype=obj -mtriple=x86_64-apple-darwin | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll b/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll index 417922c0d9a9bb..a01e6c06398aa7 100644 --- a/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll +++ b/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=i386-linux-gnu -filetype=obj -relocation-model=pic %s -o /dev/null diff --git a/llvm/test/DebugInfo/X86/parameters.ll b/llvm/test/DebugInfo/X86/parameters.ll index f0a970471bb402..5f4edd5b963dee 100644 --- a/llvm/test/DebugInfo/X86/parameters.ll +++ b/llvm/test/DebugInfo/X86/parameters.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj %s -o - | llvm-dwarfdump -v -debug-info - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/rematerialize.ll b/llvm/test/DebugInfo/X86/rematerialize.ll index 4b646be2481e4e..f3e7e0a2086ace 100644 --- a/llvm/test/DebugInfo/X86/rematerialize.ll +++ b/llvm/test/DebugInfo/X86/rematerialize.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -O2 -filetype=obj -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -debug-line - | FileCheck %s ; diff --git a/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll b/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll index e1042a95ddeee9..4e8dfc2ada7472 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s | llvm-dwarfdump -v - | \ ; RUN: FileCheck --check-prefix=DEFAULT --check-prefix=BOTH %s ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj -generate-type-units < %s | \ diff --git a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll index ffa8550be5409a..ca159eea615f68 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -split-dwarf-file=foo.dwo -filetype=obj < %s \ ; RUN: | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/string-offsets-table.ll b/llvm/test/DebugInfo/X86/string-offsets-table.ll index 21016bd286b613..e1c914a1946b8b 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-table.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-table.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s | llvm-dwarfdump -v - \ ; RUN: | FileCheck --check-prefix=MONOLITHIC %s ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -split-dwarf-file=foo.dwo -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/template.ll b/llvm/test/DebugInfo/X86/template.ll index 769e2541cd5ff8..72566154d22589 100644 --- a/llvm/test/DebugInfo/X86/template.ll +++ b/llvm/test/DebugInfo/X86/template.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | not llvm-dwarfdump -verify - | FileCheck %s --check-prefix VERIFY diff --git 
a/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll b/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll index 19d9976449bc40..883bab142ec7e3 100644 --- a/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll +++ b/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -debug-info -debug-types - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/type_units_with_addresses.ll b/llvm/test/DebugInfo/X86/type_units_with_addresses.ll index de563ee2a395bd..0f33ee2209f64c 100644 --- a/llvm/test/DebugInfo/X86/type_units_with_addresses.ll +++ b/llvm/test/DebugInfo/X86/type_units_with_addresses.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -split-dwarf-file=foo.dwo -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml b/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml index 40eec94971306a..17b638da0eb8c3 100644 --- a/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml +++ b/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml @@ -5,7 +5,6 @@ # CHECK-NEXT: Type: SHT_RELA # CHECK-NEXT: Link: .symtab # CHECK-NEXT: AddressAlign: 0x0000000000000008 -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Info: .text # CHECK-NEXT: Relocations: # CHECK-NEXT: - Symbol: main diff --git a/llvm/test/Object/obj2yaml.test b/llvm/test/Object/obj2yaml.test index 748e713d1a95a5..a5f008ffd238a1 100644 --- a/llvm/test/Object/obj2yaml.test +++ b/llvm/test/Object/obj2yaml.test @@ -362,7 +362,6 @@ # ELF-MIPSEL-NEXT: Type: SHT_REL # ELF-MIPSEL-NEXT: Link: .symtab # ELF-MIPSEL-NEXT: AddressAlign: 0x0000000000000004 -# ELF-MIPSEL-NEXT: EntSize: 0x0000000000000008 # ELF-MIPSEL-NEXT: Info: .text # ELF-MIPSEL-NEXT: Relocations: # ELF-MIPSEL-NEXT: - Symbol: _gp_disp @@ -483,7 +482,6 @@ # ELF-MIPS64EL-NEXT: Type: SHT_RELA # ELF-MIPS64EL-NEXT: Link: .symtab # ELF-MIPS64EL-NEXT: AddressAlign: 0x0000000000000008 -# ELF-MIPS64EL-NEXT: EntSize: 0x0000000000000018 # ELF-MIPS64EL-NEXT: Info: .data # ELF-MIPS64EL-NEXT: Relocations: # ELF-MIPS64EL-NEXT: - Symbol: zed @@ -552,7 +550,6 @@ # ELF-X86-64-NEXT: Address: 0x0000000000000038 # ELF-X86-64-NEXT: Link: .symtab # ELF-X86-64-NEXT: AddressAlign: 0x0000000000000008 -# ELF-X86-64-NEXT: EntSize: 0x0000000000000018 # ELF-X86-64-NEXT: Info: .text # ELF-X86-64-NEXT: Relocations: # ELF-X86-64-NEXT: - Offset: 0x000000000000000D diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll index 39a8c3c9232764..b8fa5c7f49993c 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll @@ -2678,6 +2678,228 @@ define <32 x i16> @avx512_psllv_w_512_undef(<32 x i16> %v) { ; Vector Masked Shift Amounts ; +define <8 x i16> @sse2_psra_w_128_masked(<8 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @sse2_psra_w_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> [[V:%.*]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; + %1 = and <8 x i16> %a, + %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) + ret <8 x i16> %2 +} + +define <8 x i32> @avx2_psra_d_256_masked(<8 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx2_psra_d_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] 
= and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> [[V:%.*]], <4 x i32> [[TMP1]]) +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = and <4 x i32> %a, + %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) + ret <8 x i32> %2 +} + +define <8 x i64> @avx512_psra_q_512_masked(<8 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psra_q_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[V:%.*]], <2 x i64> [[TMP1]]) +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = and <2 x i64> %a, + %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1) + ret <8 x i64> %2 +} + +define <4 x i32> @sse2_psrl_d_128_masked(<4 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @sse2_psrl_d_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> [[V:%.*]], <4 x i32> [[TMP1]]) +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = and <4 x i32> %a, + %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) + ret <4 x i32> %2 +} + +define <4 x i64> @avx2_psrl_q_256_masked(<4 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrl_q_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> [[V:%.*]], <2 x i64> [[TMP1]]) +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %1 = and <2 x i64> %a, + %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) + ret <4 x i64> %2 +} + +define <32 x i16> @avx512_psrl_w_512_masked(<32 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx512_psrl_w_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> [[V:%.*]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: ret <32 x i16> [[TMP2]] +; + %1 = and <8 x i16> %a, + %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1) + ret <32 x i16> %2 +} + +define <2 x i64> @sse2_psll_q_128_masked(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @sse2_psll_q_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[V:%.*]], <2 x i64> [[TMP1]]) +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %1 = and <2 x i64> %a, + %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) + ret <2 x i64> %2 +} + +define <16 x i16> @avx2_psll_w_256_masked(<16 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx2_psll_w_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> [[V:%.*]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; + %1 = and <8 x i16> %a, + %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) + ret <16 x i16> %2 +} + +define <16 x i32> @avx512_psll_d_512_masked(<16 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx512_psll_d_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[V:%.*]], <4 x i32> [[TMP1]]) +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = and <4 x i32> %a, + %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1) + ret <16 x i32> %2 +} + +define <8 x i16> 
@sse2_psrai_w_128_masked(<8 x i16> %v, i32 %a) { +; CHECK-LABEL: @sse2_psrai_w_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = and i32 %a, 15 + %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 %1) + ret <8 x i16> %2 +} + +define <8 x i32> @avx2_psrai_d_256_masked(<8 x i32> %v, i32 %a) { +; CHECK-LABEL: @avx2_psrai_d_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = and i32 %a, 31 + %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 %1) + ret <8 x i32> %2 +} + +define <8 x i64> @avx512_psrai_q_512_masked(<8 x i64> %v, i32 %a) { +; CHECK-LABEL: @avx512_psrai_q_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = and i32 %a, 63 + %2 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 %1) + ret <8 x i64> %2 +} + +define <4 x i32> @sse2_psrli_d_128_masked(<4 x i32> %v, i32 %a) { +; CHECK-LABEL: @sse2_psrli_d_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = and i32 %a, 31 + %2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 %1) + ret <4 x i32> %2 +} + +define <4 x i64> @avx2_psrli_q_256_masked(<4 x i64> %v, i32 %a) { +; CHECK-LABEL: @avx2_psrli_q_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = and i32 %a, 63 + %2 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 %1) + ret <4 x i64> %2 +} + +define <32 x i16> @avx512_psrli_w_512_masked(<32 x i16> %v, i32 %a) { +; CHECK-LABEL: @avx512_psrli_w_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> 
[[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <32 x i16> [[TMP3]]
+;
+ %1 = and i32 %a, 15
+ %2 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 %1)
+ ret <32 x i16> %2
+}
+
+define <2 x i64> @sse2_pslli_q_128_masked(<2 x i64> %v, i32 %a) {
+; CHECK-LABEL: @sse2_pslli_q_128_masked(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %1 = and i32 %a, 63
+ %2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 %1)
+ ret <2 x i64> %2
+}
+
+define <16 x i16> @avx2_pslli_w_256_masked(<16 x i16> %v, i32 %a) {
+; CHECK-LABEL: @avx2_pslli_w_256_masked(
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %1 = and i32 %a, 15
+ %2 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 %1)
+ ret <16 x i16> %2
+}
+
+define <16 x i32> @avx512_pslli_d_512_masked(<16 x i32> %v, i32 %a) {
+; CHECK-LABEL: @avx512_pslli_d_512_masked(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i32> [[V:%.*]], [[DOTSPLAT]]
+; CHECK-NEXT: ret <16 x i32> [[TMP2]]
+;
+ %1 = and i32 %a, 31
+ %2 = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %v, i32 %1)
+ ret <16 x i32> %2
+}
+
 define <4 x i32> @avx2_psrav_d_128_masked(<4 x i32> %v, <4 x i32> %a) {
 ; CHECK-LABEL: @avx2_psrav_d_128_masked(
 ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], <i32 31, i32 31, i32 31, i32 31>
diff --git a/llvm/test/Transforms/InstCombine/vscale_alloca.ll b/llvm/test/Transforms/InstCombine/vscale_alloca.ll
new file mode 100644
index 00000000000000..8cfc7b74a77fea
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vscale_alloca.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine -verify < %s | FileCheck %s
+
+define <vscale x 4 x i32> @alloca(<vscale x 4 x i32> %z) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT: ret <vscale x 4 x i32> [[Z:%.*]]
+;
+ %a = alloca <vscale x 4 x i32>
+ store <vscale x 4 x i32> %z, <vscale x 4 x i32>* %a
+ %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %a
+ ret <vscale x 4 x i32> %load
+}
+
+define void @alloca_dead_store(<vscale x 4 x i32> %z) {
+; CHECK-LABEL: @alloca_dead_store(
+; CHECK-NEXT: ret void
+;
+ %a = alloca <vscale x 4 x i32>
+ store <vscale x 4 x i32> %z, <vscale x 4 x i32>* %a
+ ret void
+}
+
+declare void @use(...)
+define void @alloca_zero_byte_move_first_inst() {
+; CHECK-LABEL: @alloca_zero_byte_move_first_inst(
+; CHECK-NEXT: [[B:%.*]] = alloca {}, align 8
+; CHECK-NEXT: [[A:%.*]] = alloca <vscale x 16 x i8>, align 16
+; CHECK-NEXT: call void (...) @use(<vscale x 16 x i8>* nonnull [[A]])
+; CHECK-NEXT: call void (...) @use({}* nonnull [[B]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca <vscale x 16 x i8>
+ call void (...) @use( <vscale x 16 x i8>* %a )
+ %b = alloca { }
+ call void (...) @use( { }* %b )
+ ret void
+}
diff --git a/llvm/test/Transforms/InstSimplify/add-mask.ll b/llvm/test/Transforms/InstSimplify/add-mask.ll
index e30a35f53127df..cd0c8719812058 100644
--- a/llvm/test/Transforms/InstSimplify/add-mask.ll
+++ b/llvm/test/Transforms/InstSimplify/add-mask.ll
@@ -1,9 +1,9 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instsimplify < %s | FileCheck %s
-define i1 @test(i32 %a) {
-; CHECK-LABEL: @test(
-; CHECK: ret i1 false
+define i1 @test1(i32 %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: ret i1 false
 ;
 %rhs = add i32 %a, -1
 %and = and i32 %a, %rhs
@@ -11,9 +11,20 @@ define i1 @test(i32 %a) {
 ret i1 %res
 }
+define i1 @test1v(<2 x i32> %a) {
+; CHECK-LABEL: @test1v(
+; CHECK-NEXT: ret i1 false
+;
+ %rhs = add <2 x i32> %a, <i32 -1, i32 -1>
+ %and = and <2 x i32> %a, %rhs
+ %ext = extractelement <2 x i32> %and, i32 0
+ %res = icmp eq i32 %ext, 1
+ ret i1 %res
+}
+
 define i1 @test2(i32 %a) {
 ; CHECK-LABEL: @test2(
-; CHECK: ret i1 false
+; CHECK-NEXT: ret i1 false
 ;
 %rhs = add i32 %a, 1
 %and = and i32 %a, %rhs
@@ -21,9 +32,20 @@ define i1 @test2(i32 %a) {
 ret i1 %res
 }
+define i1 @test2v(<2 x i32> %a) {
+; CHECK-LABEL: @test2v(
+; CHECK-NEXT: ret i1 false
+;
+ %rhs = add <2 x i32> %a, <i32 1, i32 1>
+ %and = and <2 x i32> %a, %rhs
+ %ext = extractelement <2 x i32> %and, i32 1
+ %res = icmp eq i32 %ext, 1
+ ret i1 %res
+}
+
 define i1 @test3(i32 %a) {
 ; CHECK-LABEL: @test3(
-; CHECK: ret i1 false
+; CHECK-NEXT: ret i1 false
 ;
 %rhs = add i32 %a, 7
 %and = and i32 %a, %rhs
@@ -31,13 +53,24 @@ define i1 @test3(i32 %a) {
 ret i1 %res
 }
+define i1 @test3v(<2 x i32> %a) {
+; CHECK-LABEL: @test3v(
+; CHECK-NEXT: ret i1 false
+;
+ %rhs = add <2 x i32> %a, <i32 7, i32 7>
+ %and = and <2 x i32> %a, %rhs
+ %ext = extractelement <2 x i32> %and, i32 0
+ %res = icmp eq i32 %ext, 1
+ ret i1 %res
+}
+
 @B = external global i32
 declare void @llvm.assume(i1)
 ; Known bits without a constant
 define i1 @test4(i32 %a) {
 ; CHECK-LABEL: @test4(
-; CHECK: [[B:%.*]] = load i32, i32* @B
+; CHECK-NEXT: [[B:%.*]] = load i32, i32* @B
 ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B]], 1
 ; CHECK-NEXT: [[B_CND:%.*]] = icmp eq i32 [[B_AND]], 1
 ; CHECK-NEXT: call void @llvm.assume(i1 [[B_CND]])
@@ -57,8 +90,8 @@ define i1 @test4(i32 %a) {
 ; Negative test - even number
 define i1 @test5(i32 %a) {
 ; CHECK-LABEL: @test5(
-; CHECK: [[RHS:%.*]] = add i32 %a, 2
-; CHECK-NEXT: [[AND:%.*]] = and i32 %a, [[RHS]]
+; CHECK-NEXT: [[RHS:%.*]] = add i32 [[A:%.*]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[A]], [[RHS]]
 ; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[AND]], 1
 ; CHECK-NEXT: ret i1 [[RES]]
 ;
@@ -68,12 +101,38 @@ define i1 @test5(i32 %a) {
 ret i1 %res
 }
+define i1 @test5v(<2 x i32> %a) {
+; CHECK-LABEL: @test5v(
+; CHECK-NEXT: [[RHS:%.*]] = add <2 x i32> [[A:%.*]], <i32 2, i32 2>
+; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[A]], [[RHS]]
+; CHECK-NEXT: [[EXT:%.*]] = extractelement <2 x i32> [[AND]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[EXT]], 1
+; CHECK-NEXT: ret i1 [[RES]]
+;
+ %rhs = add <2 x i32> %a, <i32 2, i32 2>
+ %and = and <2 x i32> %a, %rhs
+ %ext = extractelement <2 x i32> %and, i32 1
+ %res = icmp eq i32 %ext, 1
+ ret i1 %res
+}
+
 define i1 @test6(i32 %a) {
 ; CHECK-LABEL: @test6(
-; CHECK: ret i1 false
+; CHECK-NEXT: ret i1 false
 ;
 %lhs = add i32 %a, -1
 %and = and i32 %lhs, %a
 %res = icmp eq i32 %and, 1
 ret i1 %res
 }
+
+define i1 @test6v(<2 x i32> %a) {
+; CHECK-LABEL: @test6v(
+; CHECK-NEXT: ret i1 false
+;
+ %lhs = add <2 x i32> %a, <i32 -1, i32 -1>
+ %and = and <2 x i32> %lhs,
%a + %ext = extractelement <2 x i32> %and, i32 1 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} diff --git a/llvm/test/Transforms/InstSimplify/bitreverse.ll b/llvm/test/Transforms/InstSimplify/bitreverse.ll index d87b68831fe5b5..2194fd45418b6b 100644 --- a/llvm/test/Transforms/InstSimplify/bitreverse.ll +++ b/llvm/test/Transforms/InstSimplify/bitreverse.ll @@ -1,31 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -instsimplify | FileCheck %s declare i32 @llvm.bitreverse.i32(i32) +declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) -; CHECK-LABEL: @test1( -; CHECK: ret i1 false define i1 @test1(i32 %arg) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i1 false +; %a = or i32 %arg, 1 %b = call i32 @llvm.bitreverse.i32(i32 %a) %res = icmp eq i32 %b, 0 ret i1 %res } -; CHECK-LABEL: @test2( -; CHECK: ret i1 false +define i1 @test1v(<2 x i32> %arg) { +; CHECK-LABEL: @test1v( +; CHECK-NEXT: ret i1 false +; + %a = or <2 x i32> %arg, + %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a) + %c = extractelement <2 x i32> %b, i32 0 + %res = icmp eq i32 %c, 0 + ret i1 %res +} + define i1 @test2(i32 %arg) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret i1 false +; %a = or i32 %arg, 1024 %b = call i32 @llvm.bitreverse.i32(i32 %a) %res = icmp eq i32 %b, 0 ret i1 %res } -; CHECK-LABEL: @test3( -; CHECK: ret i1 false +define i1 @test2v(<2 x i32> %arg) { +; CHECK-LABEL: @test2v( +; CHECK-NEXT: ret i1 false +; + %a = or <2 x i32> %arg, + %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a) + %c = extractelement <2 x i32> %b, i32 1 + %res = icmp eq i32 %c, 0 + ret i1 %res +} + define i1 @test3(i32 %arg) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret i1 false +; %a = and i32 %arg, 1 %b = call i32 @llvm.bitreverse.i32(i32 %a) %and = and i32 %b, 1 %res = icmp eq i32 %and, 1 ret i1 %res } + +define i1 @test3v(<2 x i32> %arg) { +; CHECK-LABEL: @test3v( +; CHECK-NEXT: ret i1 false +; + %a = and <2 x i32> %arg, + %b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a) + %and = and <2 x i32> %b, + %ext = extractelement <2 x i32> %and, i32 0 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} diff --git a/llvm/test/Transforms/InstSimplify/bswap.ll b/llvm/test/Transforms/InstSimplify/bswap.ll index 5c67aa0a7643f9..3e7616c3bdc565 100644 --- a/llvm/test/Transforms/InstSimplify/bswap.ll +++ b/llvm/test/Transforms/InstSimplify/bswap.ll @@ -1,11 +1,12 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -instsimplify | FileCheck %s declare i16 @llvm.bswap.i16(i16) +declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) define i1 @test1(i16 %arg) { ; CHECK-LABEL: @test1( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %a = or i16 %arg, 1 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -13,9 +14,20 @@ define i1 @test1(i16 %arg) { ret i1 %res } +define i1 @test1v(<2 x i16> %arg) { +; CHECK-LABEL: @test1v( +; CHECK-NEXT: ret i1 false +; + %a = or <2 x i16> %arg, + %b = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a) + %c = extractelement <2 x i16> %b, i32 0 + %res = icmp eq i16 %c, 0 + ret i1 %res +} + define i1 @test2(i16 %arg) { ; CHECK-LABEL: @test2( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %a = or i16 %arg, 1024 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -23,9 +35,20 @@ define i1 @test2(i16 %arg) { ret i1 %res } +define i1 @test2v(<2 x i16> %arg) { +; CHECK-LABEL: @test2v( +; CHECK-NEXT: ret i1 false +; + %a = or <2 x i16> %arg, + %b = call <2 x i16> 
@llvm.bswap.v2i16(<2 x i16> %a) + %c = extractelement <2 x i16> %b, i32 1 + %res = icmp eq i16 %c, 0 + ret i1 %res +} + define i1 @test3(i16 %arg) { ; CHECK-LABEL: @test3( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %a = and i16 %arg, 1 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -34,9 +57,21 @@ define i1 @test3(i16 %arg) { ret i1 %res } +define i1 @test3v(<2 x i16> %arg) { +; CHECK-LABEL: @test3v( +; CHECK-NEXT: ret i1 false +; + %a = and <2 x i16> %arg, + %b = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a) + %c = extractelement <2 x i16> %b, i32 0 + %and = and i16 %c, 1 + %res = icmp eq i16 %and, 1 + ret i1 %res +} + define i1 @test4(i16 %arg) { ; CHECK-LABEL: @test4( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %a = and i16 %arg, 511 %b = call i16 @llvm.bswap.i16(i16 %a) @@ -44,3 +79,15 @@ define i1 @test4(i16 %arg) { %res = icmp eq i16 %and, 1 ret i1 %res } + +define i1 @test4v(<2 x i16> %arg) { +; CHECK-LABEL: @test4v( +; CHECK-NEXT: ret i1 false +; + %a = and <2 x i16> %arg, + %b = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a) + %and = and <2 x i16> %b, + %ext = extractelement <2 x i16> %and, i32 1 + %res = icmp eq i16 %ext, 1 + ret i1 %res +} diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 3dfdaf2ecf2f1d..3f095cd34e3d78 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -698,6 +698,18 @@ define i1 @srem2(i16 %X, i32 %Y) { ret i1 %D } +define i1 @srem2v(<2 x i16> %X, <2 x i32> %Y) { +; CHECK-LABEL: @srem2v( +; CHECK-NEXT: ret i1 false +; + %A = zext <2 x i16> %X to <2 x i32> + %B = add nsw <2 x i32> %A, + %C = srem <2 x i32> %B, %Y + %D = extractelement <2 x i32> %C, i32 0 + %E = icmp slt i32 %D, 0 + ret i1 %E +} + define i1 @srem3(i16 %X, i32 %Y) { ; CHECK-LABEL: @srem3( ; CHECK-NEXT: ret i1 false @@ -710,6 +722,19 @@ define i1 @srem3(i16 %X, i32 %Y) { ret i1 %E } +define i1 @srem3v(<2 x i16> %X, <2 x i32> %Y) { +; CHECK-LABEL: @srem3v( +; CHECK-NEXT: ret i1 false +; + %A = zext <2 x i16> %X to <2 x i32> + %B = or <2 x i32> , %A + %C = sub nsw <2 x i32> , %B + %D = srem <2 x i32> %C, %Y + %E = extractelement <2 x i32> %C, i32 1 + %F = icmp slt i32 %E, 0 + ret i1 %F +} + define i1 @udiv2(i32 %Z) { ; CHECK-LABEL: @udiv2( ; CHECK-NEXT: ret i1 true @@ -795,33 +820,55 @@ define i1 @udiv8(i32 %X, i32 %Y) { ret i1 %C } +; Square of a non-zero number is non-zero if there is no overflow. define i1 @mul1(i32 %X) { ; CHECK-LABEL: @mul1( ; CHECK-NEXT: ret i1 false ; -; Square of a non-zero number is non-zero if there is no overflow. %Y = or i32 %X, 1 %M = mul nuw i32 %Y, %Y %C = icmp eq i32 %M, 0 ret i1 %C } +define i1 @mul1v(<2 x i32> %X) { +; CHECK-LABEL: @mul1v( +; CHECK-NEXT: ret i1 false +; + %Y = or <2 x i32> %X, + %M = mul nuw <2 x i32> %Y, %Y + %E = extractelement <2 x i32> %M, i32 0 + %C = icmp eq i32 %E, 0 + ret i1 %C +} + +; Square of a non-zero number is positive if there is no signed overflow. define i1 @mul2(i32 %X) { ; CHECK-LABEL: @mul2( ; CHECK-NEXT: ret i1 true ; -; Square of a non-zero number is positive if there is no signed overflow. %Y = or i32 %X, 1 %M = mul nsw i32 %Y, %Y %C = icmp sgt i32 %M, 0 ret i1 %C } +define i1 @mul2v(<2 x i32> %X) { +; CHECK-LABEL: @mul2v( +; CHECK-NEXT: ret i1 true +; + %Y = or <2 x i32> %X, + %M = mul nsw <2 x i32> %Y, %Y + %E = extractelement <2 x i32> %M, i32 1 + %C = icmp sgt i32 %E, 0 + ret i1 %C +} + +; Product of non-negative numbers is non-negative if there is no signed overflow. 
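The fold comments in this hunk are easy to sanity-check outside of LLVM. Below is a standalone C++ sketch (an illustration, not part of the patch) that exhaustively verifies the mul1/mul2 claims above and the mul3 claim just stated, at 8-bit width where promotion to int guarantees the nuw/nsw no-overflow preconditions:

#include <cassert>

// Exhaustive 8-bit check of the mul fold comments; promoting the 8-bit
// inputs to int models the nuw/nsw "no overflow" preconditions.
int main() {
  for (int x = -128; x <= 127; ++x) {
    int y = x | 1;        // %Y = or i32 %X, 1  -> odd, hence non-zero
    assert(y * y != 0);   // mul1: icmp eq (mul nuw %Y, %Y), 0 folds to false
    assert(y * y > 0);    // mul2: icmp sgt (mul nsw %Y, %Y), 0 folds to true
    for (int z = -128; z <= 127; ++z)
      assert(x * x * (z * z) >= 0); // mul3: icmp sge folds to true
  }
  return 0;
}

The @mul3 and @mul3v tests that follow assert the same facts at the IR level.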
define i1 @mul3(i32 %X, i32 %Y) { ; CHECK-LABEL: @mul3( ; CHECK-NEXT: ret i1 true ; -; Product of non-negative numbers is non-negative if there is no signed overflow. %XX = mul nsw i32 %X, %X %YY = mul nsw i32 %Y, %Y %M = mul nsw i32 %XX, %YY @@ -829,6 +876,17 @@ define i1 @mul3(i32 %X, i32 %Y) { ret i1 %C } +define <2 x i1> @mul3v(<2 x i32> %X, <2 x i32> %Y) { +; CHECK-LABEL: @mul3v( +; CHECK-NEXT: ret <2 x i1> +; + %XX = mul nsw <2 x i32> %X, %X + %YY = mul nsw <2 x i32> %Y, %Y + %M = mul nsw <2 x i32> %XX, %YY + %C = icmp sge <2 x i32> %M, zeroinitializer + ret <2 x i1> %C +} + define <2 x i1> @vectorselect1(<2 x i1> %cond) { ; CHECK-LABEL: @vectorselect1( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] @@ -1258,7 +1316,20 @@ define i1 @icmp_known_bits(i4 %x, i4 %y) { %add = add i4 %or1, %or2 %cmp = icmp eq i4 %add, 0 ret i1 %cmp +} +define i1 @icmp_known_bits_vec(<2 x i4> %x, <2 x i4> %y) { +; CHECK-LABEL: @icmp_known_bits_vec( +; CHECK-NEXT: ret i1 false +; + %and1 = and <2 x i4> %y, + %and2 = and <2 x i4> %x, + %or1 = or <2 x i4> %and1, + %or2 = or <2 x i4> %and2, + %add = add <2 x i4> %or1, %or2 + %ext = extractelement <2 x i4> %add,i32 0 + %cmp = icmp eq i4 %ext, 0 + ret i1 %cmp } define i1 @icmp_shl_nuw_1(i64 %a) { diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll index 20f67769eed7f4..7369cddf2fbe9c 100644 --- a/llvm/test/Transforms/InstSimplify/or.ll +++ b/llvm/test/Transforms/InstSimplify/or.ll @@ -3,7 +3,7 @@ define i32 @test1(i32 %A) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: ret i32 %A +; CHECK-NEXT: ret i32 [[A:%.*]] ; %B = or i32 %A, 0 ret i32 %B @@ -27,7 +27,7 @@ define <3 x i8> @all_ones_vec_with_undef_elt(<3 x i8> %A) { define i1 @test3(i1 %A) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: ret i1 %A +; CHECK-NEXT: ret i1 [[A:%.*]] ; %B = or i1 %A, false ret i1 %B @@ -43,7 +43,7 @@ define i1 @test4(i1 %A) { define i1 @test5(i1 %A) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: ret i1 %A +; CHECK-NEXT: ret i1 [[A:%.*]] ; %B = or i1 %A, %A ret i1 %B @@ -51,7 +51,7 @@ define i1 @test5(i1 %A) { define i32 @test6(i32 %A) { ; CHECK-LABEL: @test6( -; CHECK-NEXT: ret i32 %A +; CHECK-NEXT: ret i32 [[A:%.*]] ; %B = or i32 %A, %A ret i32 %B @@ -87,110 +87,122 @@ define i8 @test9(i8 %A, i8 %B) { ret i8 %E } +; (X & C1) | C2 --> (X | C2) & (C1|C2) define i8 @test10(i8 %A) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: ret i8 -2 ; %B = or i8 %A, 1 %C = and i8 %B, -2 - ; (X & C1) | C2 --> (X | C2) & (C1|C2) %D = or i8 %C, -2 ret i8 %D } +; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) define i8 @test11(i8 %A) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: ret i8 -1 ; %B = or i8 %A, -2 %C = xor i8 %B, 13 - ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) + %D = or i8 %C, 1 + %E = xor i8 %D, 12 + ret i8 %E +} + +define i8 @test11v(<2 x i8> %A) { +; CHECK-LABEL: @test11v( +; CHECK-NEXT: ret i8 -1 +; + %B = or <2 x i8> %A, + %CV = xor <2 x i8> %B, + %C = extractelement <2 x i8> %CV, i32 0 %D = or i8 %C, 1 %E = xor i8 %D, 12 ret i8 %E } ; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0. +; If we have: ((V + N) & C1) | (V & C2) +; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 +; replace with V+N. define i39 @test1_apint(i39 %V, i39 %M) { ; CHECK-LABEL: @test1_apint( -; CHECK: [[N:%.*]] = and i39 %M, -274877906944 -; CHECK-NEXT: [[A:%.*]] = add i39 %V, [[N]] +; CHECK-NEXT: [[N:%.*]] = and i39 [[M:%.*]], -274877906944 +; CHECK-NEXT: [[A:%.*]] = add i39 [[V:%.*]], [[N]] ; CHECK-NEXT: ret i39 [[A]] ; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. 
and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. - %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943 - %N = and i39 %M, 274877906944 - %A = add i39 %V, %N - %B = and i39 %A, %C1 - %D = and i39 %V, 274877906943 - %R = or i39 %B, %D - ret i39 %R + %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943 + %N = and i39 %M, 274877906944 + %A = add i39 %V, %N + %B = and i39 %A, %C1 + %D = and i39 %V, 274877906943 + %R = or i39 %B, %D + ret i39 %R } define i7 @test2_apint(i7 %X) { ; CHECK-LABEL: @test2_apint( -; CHECK: ret i7 %X +; CHECK-NEXT: ret i7 [[X:%.*]] ; - %Y = or i7 %X, 0 - ret i7 %Y + %Y = or i7 %X, 0 + ret i7 %Y } define i17 @test3_apint(i17 %X) { ; CHECK-LABEL: @test3_apint( -; CHECK: ret i17 -1 +; CHECK-NEXT: ret i17 -1 ; - %Y = or i17 %X, -1 - ret i17 %Y + %Y = or i17 %X, -1 + ret i17 %Y } ; Test the case where Integer BitWidth > 64 && BitWidth <= 1024. +; If we have: ((V + N) & C1) | (V & C2) +; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 +; replace with V+N. define i399 @test4_apint(i399 %V, i399 %M) { ; CHECK-LABEL: @test4_apint( -; CHECK: [[N:%.*]] = and i399 %M, 18446742974197923840 -; CHECK-NEXT: [[A:%.*]] = add i399 %V, [[N]] +; CHECK-NEXT: [[N:%.*]] = and i399 [[M:%.*]], 18446742974197923840 +; CHECK-NEXT: [[A:%.*]] = add i399 [[V:%.*]], [[N]] ; CHECK-NEXT: ret i399 [[A]] ; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. - %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943 - %N = and i399 %M, 18446742974197923840 - %A = add i399 %V, %N - %B = and i399 %A, %C1 - %D = and i399 %V, 274877906943 - %R = or i399 %D, %B - ret i399 %R + %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943 + %N = and i399 %M, 18446742974197923840 + %A = add i399 %V, %N + %B = and i399 %A, %C1 + %D = and i399 %V, 274877906943 + %R = or i399 %D, %B + ret i399 %R } define i777 @test5_apint(i777 %X) { ; CHECK-LABEL: @test5_apint( -; CHECK: ret i777 %X +; CHECK-NEXT: ret i777 [[X:%.*]] ; - %Y = or i777 %X, 0 - ret i777 %Y + %Y = or i777 %X, 0 + ret i777 %Y } define i117 @test6_apint(i117 %X) { ; CHECK-LABEL: @test6_apint( -; CHECK: ret i117 -1 +; CHECK-NEXT: ret i117 -1 ; - %Y = or i117 %X, -1 - ret i117 %Y + %Y = or i117 %X, -1 + ret i117 %Y } ; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0. ; Vector version of test1_apint with the add commuted +; If we have: ((V + N) & C1) | (V & C2) +; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 +; replace with V+N. define <2 x i39> @test7_apint(<2 x i39> %V, <2 x i39> %M) { ; CHECK-LABEL: @test7_apint( ; CHECK-NEXT: [[N:%.*]] = and <2 x i39> [[M:%.*]], ; CHECK-NEXT: [[A:%.*]] = add <2 x i39> [[N]], [[V:%.*]] ; CHECK-NEXT: ret <2 x i39> [[A]] ; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. %C1 = xor <2 x i39> , ;; C2 = 274877906943 %N = and <2 x i39> %M, %A = add <2 x i39> %N, %V @@ -202,15 +214,15 @@ define <2 x i39> @test7_apint(<2 x i39> %V, <2 x i39> %M) { ; Test the case where Integer BitWidth > 64 && BitWidth <= 1024. ; Vector version of test4_apint with the add and the or commuted +; If we have: ((V + N) & C1) | (V & C2) +; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 +; replace with V+N. 
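The ((V + N) & C1) | (V & C2) ==> V + N rewrite that this comment (and its earlier copies) describes can be verified exhaustively at 8 bits. A standalone C++ sketch under the stated preconditions (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Exhaustive i8 check of ((V + N) & C1) | (V & C2) == V + N, given that
// C2 is a 0+1+ low-bit mask, C1 = ~C2, and (N & C2) == 0.
int main() {
  const uint8_t C2 = 0x0F; // zeros followed by ones, the "0+1+" shape
  const uint8_t C1 = static_cast<uint8_t>(~C2);
  for (unsigned v = 0; v < 256; ++v) {
    for (unsigned n = 0; n < 256; ++n) {
      if ((n & C2) != 0)
        continue; // precondition: (N & C2) == 0
      const uint8_t V = static_cast<uint8_t>(v);
      const uint8_t N = static_cast<uint8_t>(n);
      const uint8_t Sum = static_cast<uint8_t>(V + N);
      // N has no low bits set, so it cannot carry into them; the two
      // masked halves therefore reassemble the full sum.
      assert(static_cast<uint8_t>((Sum & C1) | (V & C2)) == Sum);
    }
  }
  return 0;
}

@test8_apint below exercises the same identity at <2 x i399>, with both the add and the or commuted.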
define <2 x i399> @test8_apint(<2 x i399> %V, <2 x i399> %M) { ; CHECK-LABEL: @test8_apint( ; CHECK-NEXT: [[N:%.*]] = and <2 x i399> [[M:%.*]], ; CHECK-NEXT: [[A:%.*]] = add <2 x i399> [[N]], [[V:%.*]] ; CHECK-NEXT: ret <2 x i399> [[A]] ; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. %C1 = xor <2 x i399> , ;; C2 = 274877906943 %N = and <2 x i399> %M, %A = add <2 x i399> %N, %V diff --git a/llvm/test/Transforms/JumpThreading/select.ll b/llvm/test/Transforms/JumpThreading/select.ll index 08598f84c1fd15..4309a1babd7365 100644 --- a/llvm/test/Transforms/JumpThreading/select.ll +++ b/llvm/test/Transforms/JumpThreading/select.ll @@ -441,3 +441,31 @@ sw.default: ; preds = %if.end, %sw.bb9 ; CHECK: i32 2, label [[DEST1]] ; CHECK: i32 4, label [[DEST2]] } + +; FIXME: This is an invalid transform. If %b is false and %x is poison, +; then the select produces poison (the result of the program is poison). +; But with this transform, we may be branching on poison, and that is UB. + +define i32 @TryToUnfoldSelectInCurrBB(i1 %b, i1 %ui, i32 %s, i1 %x) { +; CHECK-LABEL: @TryToUnfoldSelectInCurrBB( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[B:%.*]], label [[IF_END_THREAD:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: br i1 [[X:%.*]], label [[TMP0:%.*]], label [[IF_END_THREAD]] +; CHECK: 0: +; CHECK-NEXT: br label [[IF_END_THREAD]] +; CHECK: if.end.thread: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[S:%.*]], [[TMP0]] ], [ 42, [[IF_END]] ], [ 42, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[TMP1]] +; +entry: + br i1 %b, label %if.end, label %if.else + +if.else: + br label %if.end + +if.end: + %v = phi i1 [ %x, %if.else ], [ false, %entry ] + %v1 = select i1 %v, i32 %s, i32 42 + ret i32 %v1 +} diff --git a/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors.ll b/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors.ll index 27998230abe4ad..aa0ead18ba3cd6 100644 --- a/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors.ll +++ b/llvm/test/Transforms/LoopUnroll/PowerPC/p8-unrolling-legalize-vectors.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -loop-unroll | FileCheck %s ; RUN: opt < %s -S -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -loop-unroll | FileCheck %s @@ -6,6 +7,195 @@ target triple = "powerpc64le-unknown-linux-gnu" ; Function Attrs: norecurse nounwind define i8* @f(i8* returned %s, i32 zeroext %x, i32 signext %k) local_unnamed_addr #0 { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[K:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[K]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[K]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -16 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 
1 +; CHECK-NEXT: [[XTRAITER1:%.*]] = and i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP1]], 1 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] +; CHECK: vector.ph.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TMP2]], [[XTRAITER1]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <16 x i32> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT13_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = shl <16 x i32> , [[VEC_IND12]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[S:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT13:%.*]] = add <16 x i32> [[VEC_IND12]], +; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i64 [[NITER]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i32> , [[VEC_IND_NEXT13]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i32> [[TMP10]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP13]], <16 x i8>* [[TMP15]], align 1 +; CHECK-NEXT: [[INDEX_NEXT_1]] = add i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT13_1]] = add <16 x i32> [[VEC_IND_NEXT13]], +; CHECK-NEXT: [[NITER_NSUB_1]] = sub i64 [[NITER_NSUB]], 1 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT:%.*]], label [[VECTOR_BODY]] +; CHECK: middle.block.unr-lcssa.loopexit: +; CHECK-NEXT: [[INDEX_UNR_PH:%.*]] = phi i64 [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND12_UNR_PH:%.*]] = phi <16 x i32> [ [[VEC_IND_NEXT13_1]], [[VECTOR_BODY]] ] +; CHECK-NEXT: br label [[MIDDLE_BLOCK_UNR_LCSSA]] +; CHECK: middle.block.unr-lcssa: +; CHECK-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_UNR_PH]], [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[VEC_IND12_UNR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND12_UNR_PH]], [[MIDDLE_BLOCK_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER1]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD2]], label [[VECTOR_BODY_EPIL_PREHEADER:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.body.epil.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY_EPIL:%.*]] +; CHECK: vector.body.epil: +; CHECK-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ] +; CHECK-NEXT: [[VEC_IND12_EPIL:%.*]] = phi <16 x i32> [ [[VEC_IND12_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ] +; CHECK-NEXT: [[TMP16:%.*]] = shl <16 x i32> , [[VEC_IND12_EPIL]] +; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i32> [[TMP16]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] 
= icmp eq <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDEX_EPIL]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP19]], <16 x i8>* [[TMP21]], align 1 +; CHECK-NEXT: [[INDEX_NEXT_EPIL:%.*]] = add i64 [[INDEX_EPIL]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT13_EPIL:%.*]] = add <16 x i32> [[VEC_IND12_EPIL]], +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT_EPIL]], [[N_VEC]] +; CHECK-NEXT: br label [[MIDDLE_BLOCK_EPILOG_LCSSA:%.*]] +; CHECK: middle.block.epilog-lcssa: +; CHECK-NEXT: br label [[MIDDLE_BLOCK]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[WIDE_TRIP_COUNT]], -1 +; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[TMP24]], [[INDVARS_IV_PH]] +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP23]], 7 +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_PROL_PREHEADER:%.*]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]] +; CHECK: for.body.prol.preheader: +; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]] +; CHECK: for.body.prol: +; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ] +; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[XTRAITER]], [[FOR_BODY_PROL_PREHEADER]] ], [ [[PROL_ITER_SUB:%.*]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[INDVARS_IV_PROL]] to i32 +; CHECK-NEXT: [[SHL_PROL:%.*]] = shl i32 1, [[TMP26]] +; CHECK-NEXT: [[AND_PROL:%.*]] = and i32 [[SHL_PROL]], [[X]] +; CHECK-NEXT: [[TOBOOL_PROL:%.*]] = icmp eq i32 [[AND_PROL]], 0 +; CHECK-NEXT: [[CONV_PROL:%.*]] = select i1 [[TOBOOL_PROL]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_PROL]] +; CHECK-NEXT: store i8 [[CONV_PROL]], i8* [[ARRAYIDX_PROL]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 +; CHECK-NEXT: [[EXITCOND_PROL:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_PROL]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[PROL_ITER_SUB]] = sub i64 [[PROL_ITER]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_SUB]], 0 +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL]], label [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA:%.*]], !llvm.loop !0 +; CHECK: for.body.prol.loopexit.unr-lcssa: +; CHECK-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: br label [[FOR_BODY_PROL_LOOPEXIT]] +; CHECK: for.body.prol.loopexit: +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], [[FOR_BODY_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult i64 [[TMP25]], 7 +; CHECK-NEXT: br i1 [[TMP27]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]] +; CHECK: for.body.preheader.new: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER_NEW]] ], [ 
[[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, [[TMP28]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHL]], [[X]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[CONV:%.*]] = select i1 [[TOBOOL]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 1, [[TMP29]] +; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[SHL_1]], [[X]] +; CHECK-NEXT: [[TOBOOL_1:%.*]] = icmp eq i32 [[AND_1]], 0 +; CHECK-NEXT: [[CONV_1:%.*]] = select i1 [[TOBOOL_1]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: store i8 [[CONV_1]], i8* [[ARRAYIDX_1]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = trunc i64 [[INDVARS_IV_NEXT_1]] to i32 +; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 1, [[TMP30]] +; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], [[X]] +; CHECK-NEXT: [[TOBOOL_2:%.*]] = icmp eq i32 [[AND_2]], 0 +; CHECK-NEXT: [[CONV_2:%.*]] = select i1 [[TOBOOL_2]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_1]] +; CHECK-NEXT: store i8 [[CONV_2]], i8* [[ARRAYIDX_2]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = trunc i64 [[INDVARS_IV_NEXT_2]] to i32 +; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 1, [[TMP31]] +; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[SHL_3]], [[X]] +; CHECK-NEXT: [[TOBOOL_3:%.*]] = icmp eq i32 [[AND_3]], 0 +; CHECK-NEXT: [[CONV_3:%.*]] = select i1 [[TOBOOL_3]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_2]] +; CHECK-NEXT: store i8 [[CONV_3]], i8* [[ARRAYIDX_3]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 +; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[INDVARS_IV_NEXT_3]] to i32 +; CHECK-NEXT: [[SHL_4:%.*]] = shl i32 1, [[TMP32]] +; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[SHL_4]], [[X]] +; CHECK-NEXT: [[TOBOOL_4:%.*]] = icmp eq i32 [[AND_4]], 0 +; CHECK-NEXT: [[CONV_4:%.*]] = select i1 [[TOBOOL_4]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_3]] +; CHECK-NEXT: store i8 [[CONV_4]], i8* [[ARRAYIDX_4]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_3]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = trunc i64 [[INDVARS_IV_NEXT_4]] to i32 +; CHECK-NEXT: [[SHL_5:%.*]] = shl i32 1, [[TMP33]] +; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[SHL_5]], [[X]] +; CHECK-NEXT: [[TOBOOL_5:%.*]] = icmp eq i32 [[AND_5]], 0 +; CHECK-NEXT: [[CONV_5:%.*]] = select i1 [[TOBOOL_5]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_4]] +; CHECK-NEXT: store i8 [[CONV_5]], i8* [[ARRAYIDX_5]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_4]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = trunc i64 [[INDVARS_IV_NEXT_5]] to i32 +; CHECK-NEXT: [[SHL_6:%.*]] = shl i32 1, [[TMP34]] +; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[SHL_6]], [[X]] +; CHECK-NEXT: [[TOBOOL_6:%.*]] = icmp eq i32 [[AND_6]], 0 
+; CHECK-NEXT: [[CONV_6:%.*]] = select i1 [[TOBOOL_6]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_5]] +; CHECK-NEXT: store i8 [[CONV_6]], i8* [[ARRAYIDX_6]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_5]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[INDVARS_IV_NEXT_6]] to i32 +; CHECK-NEXT: [[SHL_7:%.*]] = shl i32 1, [[TMP35]] +; CHECK-NEXT: [[AND_7:%.*]] = and i32 [[SHL_7]], [[X]] +; CHECK-NEXT: [[TOBOOL_7:%.*]] = icmp eq i32 [[AND_7]], 0 +; CHECK-NEXT: [[CONV_7:%.*]] = select i1 [[TOBOOL_7]], i8 48, i8 49 +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[INDVARS_IV_NEXT_6]] +; CHECK-NEXT: store i8 [[CONV_7]], i8* [[ARRAYIDX_7]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV_NEXT_6]], 1 +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY]] +; CHECK: for.end.loopexit.unr-lcssa: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[K]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[S]], i64 [[IDXPROM1]] +; CHECK-NEXT: store i8 0, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: ret i8* [[S]] +; entry: %cmp10 = icmp sgt i32 %k, 0 br i1 %cmp10, label %for.body.lr.ph, label %for.end @@ -64,11 +254,3 @@ for.end: ; preds = %for.body, %middle.b ret i8* %s } - -; CHECK-LABEL: vector.body -; CHECK: shl -; CHECK-NEXT: and -; CHECK: shl -; CHECK-NEXT: and -; CHECK: label %vector.body - diff --git a/llvm/test/Transforms/Reassociate/cse-pairs.ll b/llvm/test/Transforms/Reassociate/cse-pairs.ll new file mode 100644 index 00000000000000..33397ea050c417 --- /dev/null +++ b/llvm/test/Transforms/Reassociate/cse-pairs.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -reassociate -early-cse -S < %s | FileCheck %s + +@num1 = local_unnamed_addr global i32 0, align 4 +@num2 = local_unnamed_addr global i32 0, align 4 +@num3 = local_unnamed_addr global i32 0, align 4 +@num4 = local_unnamed_addr global i32 0, align 4 + +define signext i32 @twoPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3, i32 signext %4) { +; CHECK-LABEL: @twoPairs( +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2:%.*]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP3:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP4:%.*]] +; CHECK-NEXT: store i32 [[TMP9]], i32* @num1, align 4 +; CHECK-NEXT: store i32 [[TMP6]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: store i32 [[TMP10]], i32* @num3, align 4 +; CHECK-NEXT: ret i32 undef +; + %6 = add i32 %2, %0 + %7 = add i32 %6, %1 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + store i32 %9, i32* @num1, align 4 + %10 = add nsw i32 %2, %0 + store i32 %10, i32* @num2, align 4 + %11 = add nsw i32 %3, %1 + store i32 %11, i32* @num3, align 4 + ret i32 undef +} + +define signext i32 @twoPairsAllOpInPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3) { +; CHECK-LABEL: @twoPairsAllOpInPairs( +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP2:%.*]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add 
i32 [[TMP6]], [[TMP3:%.*]] +; CHECK-NEXT: store i32 [[TMP7]], i32* @num1, align 4 +; CHECK-NEXT: store i32 [[TMP5]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: store i32 [[TMP8]], i32* @num3, align 4 +; CHECK-NEXT: ret i32 undef +; + %5 = add nsw i32 %0, %1 + %6 = add nsw i32 %5, %2 + %7 = add nsw i32 %6, %3 + store i32 %7, i32* @num1, align 4 + %8 = add nsw i32 %1, %2 + store i32 %8, i32* @num2, align 4 + %9 = add nsw i32 %0, %3 + store i32 %9, i32* @num3, align 4 + ret i32 undef +} + +define signext i32 @threePairsAllOpInPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3, i32 signext %4, i32 signext %5) { +; CHECK-LABEL: @threePairsAllOpInPairs( +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP3:%.*]], [[TMP2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP4:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP5:%.*]] +; CHECK-NEXT: store i32 [[TMP11]], i32* @num1, align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP0]] +; CHECK-NEXT: store i32 [[TMP12]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP4]], [[TMP1]] +; CHECK-NEXT: store i32 [[TMP13]], i32* @num3, align 4 +; CHECK-NEXT: store i32 [[TMP7]], i32* @num4, align 4 +; CHECK-NEXT: ret i32 undef +; + %7 = add nsw i32 %0, %1 + %8 = add nsw i32 %7, %2 + %9 = add nsw i32 %8, %3 + %10 = add nsw i32 %9, %4 + %11 = add nsw i32 %10, %5 + store i32 %11, i32* @num1, align 4 + %12 = add nsw i32 %0, %5 + store i32 %12, i32* @num2, align 4 + %13 = add nsw i32 %1, %4 + store i32 %13, i32* @num3, align 4 + %14 = add nsw i32 %2, %3 + store i32 %14, i32* @num4, align 4 + ret i32 undef +} diff --git a/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll b/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll index a354ae0d4d5d1d..f7fdc1ed5e64b4 100644 --- a/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll +++ b/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll @@ -7,16 +7,13 @@ define void @sdiv1_cmp_constants(i32 %x) { ; CHECK-NEXT: [[D:%.*]] = sdiv i32 1, [[X:%.*]] ; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 0, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 1, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_1]]) -; CHECK-NEXT: [[C_2:%.*]] = icmp slt i32 2, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 1, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_3]]) ; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 0, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_4]]) -; CHECK-NEXT: [[C_5:%.*]] = icmp eq i32 2, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: ret void ; %d = sdiv i32 1, %x @@ -47,8 +44,7 @@ define void @sdiv1_cmp_range_1(i32 %x, i1 %c) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[BB1]] ], [ 2, [[BB2]] ] ; CHECK-NEXT: [[D:%.*]] = sdiv i32 1, [[X:%.*]] -; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[P]], [[D]] ; CHECK-NEXT: call void @use(i1 [[C_1]]) ; CHECK-NEXT: ret void @@ -80,10 +76,8 @@ define void @sdiv1_cmp_range_2(i32 %x, i1 %c) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i32 [ 3, [[BB1]] ], [ 2, [[BB2]] ] ; CHECK-NEXT: [[D:%.*]] = sdiv 
i32 1, [[X:%.*]] -; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: ret void ; br i1 %c, label %bb1, label %bb2 diff --git a/llvm/test/Transforms/SCCP/ip-constant-ranges.ll b/llvm/test/Transforms/SCCP/ip-constant-ranges.ll index dbaedaa739bf40..891bebf105b825 100644 --- a/llvm/test/Transforms/SCCP/ip-constant-ranges.ll +++ b/llvm/test/Transforms/SCCP/ip-constant-ranges.ll @@ -59,12 +59,9 @@ entry: ret i32 %res.2 } -; x is overdefined, because constant ranges are only used for parameter -; values. ; CHECK-LABEL: f3 -; CHECK: %cmp = icmp sgt i32 %x, 300 -; CHECK: %res = select i1 %cmp, i32 1, i32 2 -; CHECK: ret i32 %res +; CHECK-LABEL: entry: +; CHECK: ret i32 undef define internal i32 @f3(i32 %x) { entry: %cmp = icmp sgt i32 %x, 300 @@ -83,7 +80,7 @@ if.true: end: %res = phi i32 [ 0, %entry], [ 1, %if.true ] %call1 = tail call i32 @f3(i32 %res) - ret i32 %call1 + ret i32 2 } ; CHECK-LABEL: f4 diff --git a/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll b/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll new file mode 100644 index 00000000000000..cef41bbdb584bd --- /dev/null +++ b/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll @@ -0,0 +1,134 @@ +; RUN: opt < %s -ipsccp -S | FileCheck %s + +; x = [10, 21), y = [100, 201) +; x + y = [110, 221) +define internal i1 @f.add(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.add(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = add i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, 219 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, 111 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, 150 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, 150 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = add i32 %x, %y + %c.1 = icmp sgt i32 %a.1, 220 + %c.2 = icmp sgt i32 %a.1, 219 + %c.3 = icmp slt i32 %a.1, 110 + %c.4 = icmp slt i32 %a.1, 111 + %c.5 = icmp eq i32 %a.1, 150 + %c.6 = icmp slt i32 %a.1, 150 + %res.1 = add i1 %c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.add() { +; CHECK-LABEL: define i1 @caller.add() { +; CHECK-NEXT: %call.1 = tail call i1 @f.add(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.add(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.add(i32 10, i32 100) + %call.2 = tail call i1 @f.add(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} + + +; x = [10, 21), y = [100, 201) +; x - y = [-190, -79) +define internal i1 @f.sub(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.sub(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = sub i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, -81 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, -189 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, -150 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, -150 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = sub i32 %x, %y + %c.1 = icmp sgt 
i32 %a.1, -80 + %c.2 = icmp sgt i32 %a.1, -81 + %c.3 = icmp slt i32 %a.1, -190 + %c.4 = icmp slt i32 %a.1, -189 + %c.5 = icmp eq i32 %a.1, -150 + %c.6 = icmp slt i32 %a.1, -150 + %res.1 = add i1 %c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.sub() { +; CHECK-LABEL: define i1 @caller.sub() { +; CHECK-NEXT: %call.1 = tail call i1 @f.sub(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.sub(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.sub(i32 10, i32 100) + %call.2 = tail call i1 @f.sub(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} + +; x = [10, 21), y = [100, 201) +; x * y = [1000, 4001) +define internal i1 @f.mul(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.mul(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = mul i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, 3999 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, 1001 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, 1500 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, 1500 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = mul i32 %x, %y + %c.1 = icmp sgt i32 %a.1, 4000 + %c.2 = icmp sgt i32 %a.1, 3999 + %c.3 = icmp slt i32 %a.1, 1000 + %c.4 = icmp slt i32 %a.1, 1001 + %c.5 = icmp eq i32 %a.1, 1500 + %c.6 = icmp slt i32 %a.1, 1500 + %res.1 = add i1 %c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.mul() { +; CHECK-LABEL: define i1 @caller.mul() { +; CHECK-NEXT: %call.1 = tail call i1 @f.mul(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.mul(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.mul(i32 10, i32 100) + %call.2 = tail call i1 @f.mul(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} diff --git a/llvm/test/Transforms/SCCP/ip-ranges-phis.ll b/llvm/test/Transforms/SCCP/ip-ranges-phis.ll new file mode 100644 index 00000000000000..a4a59d9c0f816b --- /dev/null +++ b/llvm/test/Transforms/SCCP/ip-ranges-phis.ll @@ -0,0 +1,215 @@ +; RUN: opt < %s -ipsccp -S | FileCheck %s + +define internal i32 @f1(i32 %x) { +; CHECK-LABEL: define internal i32 @f1( +; CHECK-NEXT: ret i32 undef +; + %cmp = icmp sgt i32 %x, 300 + %res = select i1 %cmp, i32 1, i32 2 + ret i32 %res +} + +; %res is a constant range [0, 2) from a PHI node. 
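+; Illustrative range walk-through: the incoming values 0 (%entry) and
+; 1 (%if.true) merge to the constant range [0, 2). Propagated into @f1,
+; "icmp sgt i32 %x, 300" can never be true, so the select always yields 2,
+; the body of @f1 folds away, and the call site can return the constant
+; directly.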
+define i32 @caller1(i1 %cmp) { +; CHECK-LABEL: define i32 @caller1( +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp, label %if.true, label %end + +; CHECK-LABEL: if.true: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %res = phi i32 [ 0, %entry ], [ 1, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f1(i32 %res) +; CHECK-NEXT: ret i32 2 +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %res = phi i32 [ 0, %entry], [ 1, %if.true ] + %call1 = tail call i32 @f1(i32 %res) + ret i32 %call1 +} + +define internal i32 @f2(i32 %x, i32 %y, i32 %z, i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: define internal i32 @f2( +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true.1, label %end + +; CHECK-LABEL: if.true.1: +; CHECK-NEXT: br i1 %cmp.2, label %if.true.2, label %end + +; CHECK-LABEL: if.true.2: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ], [ %z, %if.true.2 ] +; CHECK-NEXT: %c.1 = icmp sgt i32 %p, 5 +; CHECK-NEXT: %c.2 = icmp eq i32 %p, 0 +; CHECK-NEXT: %c.3 = icmp slt i32 %p, 0 +; CHECK-NEXT: %v.1 = select i1 %c.1, i32 10, i32 100 +; CHECK-NEXT: %v.2 = select i1 %c.2, i32 20, i32 200 +; CHECK-NEXT: %v.3 = select i1 %c.3, i32 30, i32 300 +; CHECK-NEXT: %r.1 = add i32 %v.1, %v.2 +; CHECK-NEXT: %r.2 = add i32 %r.1, %v.3 +; CHECK-NEXT: %r.3 = add i32 %r.2, 400 +; CHECK-NEXT: %r.4 = add i32 %r.3, 50 +; CHECK-NEXT: %r.5 = add i32 %r.4, 60 +; CHECK-NEXT: %r.6 = add i32 %r.4, 700 +; CHECK-NEXT: ret i32 %r.6 +; +entry: + br i1 %cmp.1, label %if.true.1, label %end + +if.true.1: + br i1 %cmp.2, label %if.true.2, label %end + +if.true.2: + br label %end + +end: + %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ], [ %z, %if.true.2 ] + %c.1 = icmp sgt i32 %p, 5 + %c.2 = icmp eq i32 %p, 0 + %c.3 = icmp slt i32 %p, 0 + %c.4 = icmp sgt i32 %p, 10 + %c.5 = icmp sle i32 %p, 10 + %c.6 = icmp sgt i32 %p, -11 + %c.7 = icmp slt i32 %p, -11 + %v.1 = select i1 %c.1, i32 10, i32 100 + %v.2 = select i1 %c.2, i32 20, i32 200 + %v.3 = select i1 %c.3, i32 30, i32 300 + %v.4 = select i1 %c.4, i32 40, i32 400 + %v.5 = select i1 %c.5, i32 50, i32 500 + %v.6 = select i1 %c.6, i32 60, i32 600 + %v.7 = select i1 %c.7, i32 70, i32 700 + %r.1 = add i32 %v.1, %v.2 + %r.2 = add i32 %r.1, %v.3 + %r.3 = add i32 %r.2, %v.4 + %r.4 = add i32 %r.3, %v.5 + %r.5 = add i32 %r.4, %v.6 + %r.6 = add i32 %r.4, %v.7 + ret i32 %r.6 +} + +define i32 @caller2(i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: define i32 @caller2(i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true, label %end + +; CHECK-LABEL: if.true: ; preds = %entry +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: ; preds = %if.true, %entry +; CHECK-NEXT: %p1 = phi i32 [ 0, %entry ], [ 1, %if.true ] +; CHECK-NEXT: %p2 = phi i32 [ 1, %entry ], [ -10, %if.true ] +; CHECK-NEXT: %p3 = phi i32 [ 1, %entry ], [ 10, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f2(i32 %p1, i32 %p2, i32 %p3, i1 %cmp.1, i1 %cmp.2) +; CHECK-NEXT: ret i32 %call1 +; + +entry: + br i1 %cmp.1, label %if.true, label %end + +if.true: + br label %end + +end: + %p1 = phi i32 [ 0, %entry], [ 1, %if.true ] + %p2 = phi i32 [ 1, %entry], [ -10, %if.true ] + %p3 = phi i32 [ 1, %entry], [ 10, %if.true ] + %call1 = tail call i32 @f2(i32 %p1, i32 %p2, i32 %p3, i1 %cmp.1, i1 %cmp.2) + ret i32 %call1 +} + +define internal i32 @f3(i32 %x, i32 %y, i1 %cmp.1) { +; CHECK-LABEL: define internal i32 @f3(i32 %x, i32 %y, i1 %cmp.1) { +; CHECK-LABEL: entry: +; 
CHECK-NEXT: br i1 %cmp.1, label %if.true.1, label %end + +; CHECK-LABEL: if.true.1: ; preds = %entry +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: ; preds = %if.true.1, %entry +; CHECK-NEXT: %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ] +; CHECK-NEXT: %c.1 = icmp sgt i32 %p, 5 +; CHECK-NEXT: %c.2 = icmp eq i32 %p, 0 +; CHECK-NEXT: %c.3 = icmp slt i32 %p, 0 +; CHECK-NEXT: %c.4 = icmp sgt i32 %p, 10 +; CHECK-NEXT: %c.5 = icmp sle i32 %p, 10 +; CHECK-NEXT: %c.6 = icmp sgt i32 %p, -11 +; CHECK-NEXT: %c.7 = icmp slt i32 %p, -11 +; CHECK-NEXT: %v.1 = select i1 %c.1, i32 10, i32 100 +; CHECK-NEXT: %v.2 = select i1 %c.2, i32 20, i32 200 +; CHECK-NEXT: %v.3 = select i1 %c.3, i32 30, i32 300 +; CHECK-NEXT: %v.4 = select i1 %c.4, i32 40, i32 400 +; CHECK-NEXT: %v.5 = select i1 %c.5, i32 50, i32 500 +; CHECK-NEXT: %v.6 = select i1 %c.6, i32 60, i32 600 +; CHECK-NEXT: %v.7 = select i1 %c.7, i32 70, i32 700 +; CHECK-NEXT: %r.1 = add i32 %v.1, %v.2 +; CHECK-NEXT: %r.2 = add i32 %r.1, %v.3 +; CHECK-NEXT: %r.3 = add i32 %r.2, %v.4 +; CHECK-NEXT: %r.4 = add i32 %r.3, %v.5 +; CHECK-NEXT: %r.5 = add i32 %r.4, %v.6 +; CHECK-NEXT: %r.6 = add i32 %r.4, %v.7 +; CHECK-NEXT: ret i32 %r.6 +; +entry: + br i1 %cmp.1, label %if.true.1, label %end + +if.true.1: + br label %end + +end: + %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ] + %c.1 = icmp sgt i32 %p, 5 + %c.2 = icmp eq i32 %p, 0 + %c.3 = icmp slt i32 %p, 0 + %c.4 = icmp sgt i32 %p, 10 + %c.5 = icmp sle i32 %p, 10 + %c.6 = icmp sgt i32 %p, -11 + %c.7 = icmp slt i32 %p, -11 + %v.1 = select i1 %c.1, i32 10, i32 100 + %v.2 = select i1 %c.2, i32 20, i32 200 + %v.3 = select i1 %c.3, i32 30, i32 300 + %v.4 = select i1 %c.4, i32 40, i32 400 + %v.5 = select i1 %c.5, i32 50, i32 500 + %v.6 = select i1 %c.6, i32 60, i32 600 + %v.7 = select i1 %c.7, i32 70, i32 700 + %r.1 = add i32 %v.1, %v.2 + %r.2 = add i32 %r.1, %v.3 + %r.3 = add i32 %r.2, %v.4 + %r.4 = add i32 %r.3, %v.5 + %r.5 = add i32 %r.4, %v.6 + %r.6 = add i32 %r.4, %v.7 + ret i32 %r.6 +} + +define i32 @caller3(i32 %y, i1 %cmp.1) { +; CHECK-LABEL: define i32 @caller3(i32 %y, i1 %cmp.1) { +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true, label %end + +; CHECK-LABEL: if.true: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %p1 = phi i32 [ 0, %entry ], [ 5, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f3(i32 %p1, i32 %y, i1 %cmp.1) +; CHECK-NEXT: ret i32 %call1 +; +entry: + br i1 %cmp.1, label %if.true, label %end + +if.true: + br label %end + +end: + %p1 = phi i32 [ 0, %entry], [ 5, %if.true ] + %call1 = tail call i32 @f3(i32 %p1, i32 %y, i1 %cmp.1) + ret i32 %call1 +} diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll new file mode 100644 index 00000000000000..dac8273ab2d1d5 --- /dev/null +++ b/llvm/test/Transforms/SCCP/phis.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -sccp -S | FileCheck %s + +define i1 @float.1(i1 %cmp) { +; CHECK-LABEL: define i1 @float.1(i1 %cmp) { + +; CHECK-LABEL: end: +; CHECK-NEXT: ret i1 true +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ 1.0, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + +define i1 @float.2(i1 %cmp) { +; CHECK-LABEL: define i1 @float.2(i1 %cmp) { + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %if.true ] +; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 +; CHECK-NEXT: ret i1 %c +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: 
+ %p = phi float [ 1.0, %entry ], [ 2.0, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + +define i1 @float.3(float %f, i1 %cmp) { +; CHECK-LABEL: define i1 @float.3(float %f, i1 %cmp) + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ %f, %if.true ] +; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 +; CHECK-NEXT: ret i1 %c +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ %f, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + + +define i1 @float.4_unreachable(float %f, i1 %cmp) { +; CHECK-LABEL: define i1 @float.4_unreachable(float %f, i1 %cmp) + +; CHECK-LABEL: end: +; CHECK-NEXT: ret i1 false +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +dead: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ 1.0, %if.true], [ %f, %dead ] + %c = fcmp une float %p, 1.0 + ret i1 %c +} diff --git a/llvm/test/Transforms/SCCP/range-and.ll b/llvm/test/Transforms/SCCP/range-and.ll index e948274dd8f14d..3b349692db7042 100644 --- a/llvm/test/Transforms/SCCP/range-and.ll +++ b/llvm/test/Transforms/SCCP/range-and.ll @@ -8,16 +8,13 @@ define void @and_range_limit(i64 %a) { ; CHECK-NEXT: [[R:%.*]] = and i64 [[A:%.*]], 255 ; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[R]], 15 ; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[R]], 256 -; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[R]], 100 ; CHECK-NEXT: call void @use(i1 [[C_2]]) -; CHECK-NEXT: [[C_3:%.*]] = icmp eq i64 [[R]], 300 -; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_4:%.*]] = icmp ne i64 [[R]], 100 ; CHECK-NEXT: call void @use(i1 [[C_4]]) -; CHECK-NEXT: [[C_5:%.*]] = icmp ne i64 [[R]], 300 -; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; %r = and i64 %a, 255 @@ -144,8 +141,7 @@ define i1 @constant_range_and_255_100(i1 %cond, i64 %a) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[R_1]], [[BB1]] ], [ [[R_2]], [[BB2]] ] ; CHECK-NEXT: [[P_AND:%.*]] = and i64 [[P]], 512 -; CHECK-NEXT: [[C:%.*]] = icmp ult i64 [[P_AND]], 256 -; CHECK-NEXT: ret i1 [[C]] +; CHECK-NEXT: ret i1 true ; entry: br i1 %cond, label %bb1, label %bb2 diff --git a/llvm/test/Transforms/SCCP/vector-bitcast.ll b/llvm/test/Transforms/SCCP/vector-bitcast.ll index b032085083c60b..35312034c65b83 100644 --- a/llvm/test/Transforms/SCCP/vector-bitcast.ll +++ b/llvm/test/Transforms/SCCP/vector-bitcast.ll @@ -2,7 +2,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" -; CHECK: store volatile <2 x i64> zeroinitializer, <2 x i64>* %p +; FIXME: Add back support for handling special values of vector/fp types. 
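+; Previously SCCP folded the stored value to zeroinitializer here; with the
+; special-value handling removed, the 'and' result is kept as-is, which the
+; updated CHECK line below pins down.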
+; CHECK: store volatile <2 x i64> %and.i119.i, <2 x i64>* %p ; rdar://11324230 define void @foo(<2 x i64>* %p) nounwind { diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll index e72413e8b308ba..93fe8a2019079f 100644 --- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll @@ -11,14 +11,15 @@ define double @_Z3fooRdS_S_S_(double* dereferenceable(8) %x, double* dereference ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[X:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[MUL:%.*]] = fadd fast double 1.000000e+00, [[TMP3]] ; CHECK-NEXT: store double [[MUL]], double* [[Y]], align 8 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: if.else: -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast double [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul fast double [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[SUB1:%.*]] = fsub fast double [[MUL1]], [[TMP0]] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr double, double* [[Y]], i32 1 ; CHECK-NEXT: store double [[SUB1]], double* [[GEP1]], align 8 ; CHECK-NEXT: br label [[IF_END]] diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll new file mode 100644 index 00000000000000..0e9f4e01561d9b --- /dev/null +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: opt --verify %s + +define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) { + %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + ret void +} + +; integer arith +declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> 
@llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; bit arith +declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) diff --git a/llvm/test/tools/llvm-dwp/X86/duplicate.test b/llvm/test/tools/llvm-dwp/X86/duplicate.test index 43266a24b6014c..de5f1fdd4231fd 100644 --- a/llvm/test/tools/llvm-dwp/X86/duplicate.test +++ b/llvm/test/tools/llvm-dwp/X86/duplicate.test @@ -18,10 +18,10 @@ RUN: | FileCheck --check-prefix=DWO1DWP %s Build from a, b, and c.c all containing a single void() func by the name of the file. -DWOS: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} -1DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' (from '{{.*}}ac.dwp') and 'c.c'{{$}} -2DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from '{{.*}}bc.dwp'){{$}} +DWOS: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} +1DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' (from '{{.*}}ac.dwp') and 'c.c'{{$}} +2DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from '{{.*}}bc.dwp'){{$}} -DWODWOS: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} -DWO1DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' (from 'c.dwo' in '{{.*}}ac.dwp') and 'c.c'{{$}} -DWO2DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from 'c.dwo' in '{{.*}}bc.dwp'){{$}} +DWODWOS: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} +DWO1DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' (from 'c.dwo' in '{{.*}}ac.dwp') and 'c.c'{{$}} +DWO2DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from 'c.dwo' in '{{.*}}bc.dwp'){{$}} diff --git a/llvm/test/tools/llvm-dwp/X86/gcc_type.test b/llvm/test/tools/llvm-dwp/X86/gcc_type.test index 7bb1a64cd2df07..eb8f2ba9fd3740 100644 --- a/llvm/test/tools/llvm-dwp/X86/gcc_type.test +++ b/llvm/test/tools/llvm-dwp/X86/gcc_type.test @@ -5,4 +5,4 @@ CHECK: Type Unit CHECK: Type Unit // Check that llvm-dwp can parse DW_FORM_string for CU name -DUP: Duplicate DWO ID ({{.*}}) in 'a.cpp' and 'a.cpp' +DUP: duplicate DWO ID ({{.*}}) in 'a.cpp' and 'a.cpp' diff --git a/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test b/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test index 6fc96b2a1a3012..92c9a12cb91e4c 100644 --- a/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test +++ b/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test @@ -1,3 +1,3 @@ RUN: not llvm-dwp %p/../Inputs/invalid_cu_index/x.dwp -o %t 2>&1 | FileCheck %s -CHECK: error: Failed to parse cu_index +CHECK: error: failed to parse cu_index diff --git a/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test b/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test index 99f5253e8b3238..b84ed7c0765c33 100644 --- a/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test +++ b/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test @@ -1,3 +1,3 @@ RUN: not llvm-dwp %p/../Inputs/missing_tu_index/x.dwp -o %t 2>&1 | FileCheck %s -CHECK: error: Failed to parse tu_index +CHECK: error: failed to parse tu_index diff --git a/llvm/test/tools/llvm-locstats/locstats.ll b/llvm/test/tools/llvm-locstats/locstats.ll index f16635d2e8e42a..fd28679f3ec133 100644 --- a/llvm/test/tools/llvm-locstats/locstats.ll +++ b/llvm/test/tools/llvm-locstats/locstats.ll @@ -9,9 +9,9 @@ ; LOCSTATS: 
[10%,20%) 0 0%
; LOCSTATS: [20%,30%) 1 11%
; LOCSTATS: [30%,40%) 0 0%
-; LOCSTATS: [40%,50%) 1 11%
-; LOCSTATS: [50%,60%) 1 11%
-; LOCSTATS: [60%,70%) 1 11%
+; LOCSTATS: [40%,50%) 0 0%
+; LOCSTATS: [50%,60%) 0 0%
+; LOCSTATS: [60%,70%) 3 33%
; LOCSTATS: [70%,80%) 0 0%
; LOCSTATS: [80%,90%) 2 22%
; LOCSTATS: [90%,100%) 1 11%
diff --git a/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test b/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test
index 179673848161a4..16be495db9d418 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test
@@ -1,6 +1,5 @@
 // partitions.elf was generated by running this test in lld:
-// REQUIRES: x86
 // RUN: llvm-mc %s -o %t.o -filetype=obj --triple=x86_64-unknown-linux
 // RUN: ld.lld %t.o -o %t --export-dynamic --gc-sections
diff --git a/llvm/test/tools/obj2yaml/dynamic-section.test b/llvm/test/tools/obj2yaml/dynamic-section.test
index 28066dd16adfb8..65eb335746a812 100644
--- a/llvm/test/tools/obj2yaml/dynamic-section.test
+++ b/llvm/test/tools/obj2yaml/dynamic-section.test
@@ -1,16 +1,15 @@
-# RUN: yaml2obj %s -o %t
-# RUN: obj2yaml %t | FileCheck %s
+## Check we can use obj2yaml to yamalize an object containing a .dynamic
+## section. Check that the resulting section has the proper attributes and
+## dynamic tags and that we do not dump the default sh_entsize.
-## Check we can use obj2yaml to yamalize the object containing
-## .dynamic section. Check that resulting section has the
-## proper attributes and dynamic tags.
+# RUN: yaml2obj -DENTSIZE=0x10 %s -o %t1
+# RUN: obj2yaml %t1 | FileCheck %s
# CHECK: Sections:
# CHECK-NEXT: - Name: .dynamic
# CHECK-NEXT: Type: SHT_DYNAMIC
# CHECK-NEXT: Address: 0x0000000000001000
# CHECK-NEXT: AddressAlign: 0x0000000000002000
-# CHECK-NEXT: EntSize: 0x0000000000000010
# CHECK-NEXT: Entries:
# CHECK-NEXT: - Tag: DT_NULL
# CHECK-NEXT: Value: 0x0000000000000000
@@ -136,7 +135,7 @@ Sections:
    Type: SHT_DYNAMIC
    Address: 0x0000000000001000
    AddressAlign: 0x0000000000002000
-    EntSize: 0x0000000000000010
+    EntSize: [[ENTSIZE]]
    Entries:
      - Tag: DT_NULL
        Value: 0x0000000000000000
@@ -250,3 +249,11 @@ Sections:
     Value: 0x0000000000000036
   - Tag: DT_USED
     Value: 0x0000000000000001
+
+## Test the behavior when sh_entsize is invalid.
+## Here we use 0xFF as an arbitrary broken value instead of the expected 0x10.
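+## (0x10 is sizeof(Elf64_Dyn), the default sh_entsize that obj2yaml now
+## suppresses when dumping.)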
+ +# RUN: yaml2obj -DENTSIZE=0xff %s -o %t2 +# RUN: not obj2yaml %t2 2>&1 | FileCheck %s --check-prefix=ENTSIZE + +# ENTSIZE: section [index 1] has an invalid sh_entsize: 255 diff --git a/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml b/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml index 512b71ea4d833b..e476242eb1c679 100644 --- a/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml +++ b/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml @@ -10,9 +10,8 @@ # CHECK-NEXT: Type: ET_REL # CHECK-NEXT: Machine: EM_PPC64 # CHECK-NEXT: Sections: -# CHECK-NEXT: - Name: .rela.text -# CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 +# CHECK-NEXT: - Name: .rela.text +# CHECK-NEXT: Type: SHT_RELA # CHECK-NEXT: Relocations: # CHECK-NEXT: - Type: R_PPC64_NONE # CHECK-NEXT: - Type: R_PPC64_ADDR32 diff --git a/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml b/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml index 0f2906470f3ded..8863ac8c9e9bd1 100644 --- a/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml +++ b/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml @@ -19,11 +19,11 @@ # CHECK-NEXT: AddressAlignment: # CHECK-NEXT: EntrySize: -# YAML: - Name: .rela.dyn -# YAML-NEXT: Type: SHT_RELA -# YAML-NEXT: Flags: [ SHF_ALLOC ] -# YAML-NEXT: Link: .dynsym -# YAML-NEXT: EntSize: 0x0000000000000018 +# YAML: - Name: .rela.dyn +# YAML-NEXT: Type: SHT_RELA +# YAML-NEXT: Flags: [ SHF_ALLOC ] +# YAML-NEXT: Link: .dynsym +# YAML-NEXT: - Name: --- !ELF FileHeader: @@ -37,7 +37,6 @@ Sections: Type: SHT_RELA Flags: [ SHF_ALLOC ] Link: .dynsym - EntSize: 0x0000000000000018 # Add at least one symbol to trigger the .dynsym emission. DynamicSymbols: - Name: bar diff --git a/llvm/test/tools/obj2yaml/no-symbol-reloc.test b/llvm/test/tools/obj2yaml/no-symbol-reloc.test index 97800491a7afa4..8940f6a9e8b79f 100644 --- a/llvm/test/tools/obj2yaml/no-symbol-reloc.test +++ b/llvm/test/tools/obj2yaml/no-symbol-reloc.test @@ -16,7 +16,6 @@ # CHECK-NEXT: Flags: [ SHF_ALLOC, SHF_EXECINSTR ] # CHECK-NEXT: - Name: .rela.text # CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Info: .text # CHECK-NEXT: Relocations: # CHECK-NEXT: - Type: R_X86_64_NONE diff --git a/llvm/test/tools/obj2yaml/rel-rela-section.yaml b/llvm/test/tools/obj2yaml/rel-rela-section.yaml new file mode 100644 index 00000000000000..6bef1d30cad8d8 --- /dev/null +++ b/llvm/test/tools/obj2yaml/rel-rela-section.yaml @@ -0,0 +1,49 @@ +## This is a generic test for SHT_REL/SHT_RELA sections. + +## Check that we do not print excessive default +## fields for SHT_REL[A] sections. +# RUN: yaml2obj %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=YAML + +## Note: it is important to have at least two sections with sh_info == 0. +## Previously we printed a broken Info field in this case. +# YAML: - Name: .rela.dyn +# YAML-NEXT: Type: SHT_RELA +# YAML-NEXT: - Name: .rel.dyn +# YAML-NEXT: Type: SHT_REL +# YAML-NEXT: - Name + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .rela.dyn + Type: SHT_RELA + - Name: .rel.dyn + Type: SHT_REL +## Trigger the .dynsym emission. +DynamicSymbols: [] + +## Test the behavior when the sh_entsize field is broken. +## Here we use the 0xFE value instead of expected 0x18/0x10. 
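+## (0x18 and 0x10 are sizeof(Elf64_Rela) and sizeof(Elf64_Rel), the default
+## entry sizes that the dumper would otherwise omit from the output.)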
+ +# RUN: yaml2obj -DTYPE=SHT_RELA --docnum=2 %s -o %t2.rela +# RUN: not obj2yaml %t2.rela 2>&1 | FileCheck %s --check-prefix=ERR +# RUN: yaml2obj -DTYPE=SHT_REL --docnum=2 %s -o %t2.rel +# RUN: not obj2yaml %t2.rel 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: section [index 1] has an invalid sh_entsize: 254 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .foo + Type: [[TYPE]] + EntSize: 0xFE diff --git a/llvm/test/tools/obj2yaml/relocation-type.yaml b/llvm/test/tools/obj2yaml/relocation-type.yaml index 6ab8e9c462ea1a..d069abe68edaec 100644 --- a/llvm/test/tools/obj2yaml/relocation-type.yaml +++ b/llvm/test/tools/obj2yaml/relocation-type.yaml @@ -13,7 +13,6 @@ # CHECK-NEXT: Sections: # CHECK-NEXT: - Name: .rela.text # CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Relocations: # CHECK-NEXT: - Offset: 0x0000000000000009 # CHECK-NEXT: Type: [[FIRST]] diff --git a/llvm/test/tools/obj2yaml/relr-section.yaml b/llvm/test/tools/obj2yaml/relr-section.yaml index 37ddf2c2e67163..3134fcc22abe7e 100644 --- a/llvm/test/tools/obj2yaml/relr-section.yaml +++ b/llvm/test/tools/obj2yaml/relr-section.yaml @@ -1,7 +1,8 @@ ## Test how we dump SHT_RELR sections for 32 and 64-bit targets. -## Test we use the "Entries" property when it is possible do -## dump values correctly. +## Test we use the "Entries" property when it is possible to +## dump values correctly. Also, check we do not dump sh_entsize when +## it has the default value. # RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=LSB %s -o %t.32le # RUN: obj2yaml %t.32le | FileCheck %s --check-prefix=ELF32LE @@ -15,25 +16,21 @@ # ELF64LE: Sections: # ELF64LE-NEXT: - Name: .relr.dyn # ELF64LE-NEXT: Type: SHT_RELR -# ELF64LE-NEXT: EntSize: 0x0000000000000008 # ELF64LE-NEXT: Entries: [ 0x8877665544332211 ] # ELF32LE: Sections: # ELF32LE-NEXT: - Name: .relr.dyn # ELF32LE-NEXT: Type: SHT_RELR -# ELF32LE-NEXT: EntSize: 0x0000000000000004 # ELF32LE-NEXT: Entries: [ 0x0000000044332211, 0x0000000088776655 ] # ELF64BE: Sections: # ELF64BE-NEXT: - Name: .relr.dyn # ELF64BE-NEXT: Type: SHT_RELR -# ELF64BE-NEXT: EntSize: 0x0000000000000008 # ELF64BE-NEXT: Entries: [ 0x1122334455667788 ] # ELF32BE: Sections: # ELF32BE-NEXT: - Name: .relr.dyn # ELF32BE-NEXT: Type: SHT_RELR -# ELF32BE-NEXT: EntSize: 0x0000000000000004 # ELF32BE-NEXT: Entries: [ 0x0000000011223344, 0x0000000055667788 ] --- !ELF @@ -54,7 +51,6 @@ Sections: # CONTENT: - Name: .relr.dyn # CONTENT-NEXT: Type: SHT_RELR -# CONTENT-NEXT: EntSize: 0x0000000000000008 # CONTENT-NEXT: Content: '11223344556677' --- !ELF @@ -67,3 +63,27 @@ Sections: - Name: .relr.dyn Type: SHT_RELR Content: "11223344556677" + +## Test we are able to dump a SHT_RELR section when sh_entsize is invalid. +## Here we use 0xFE as a value instead of expected 0x8. + +# RUN: yaml2obj --docnum=3 %s -o %t.entsize +# RUN: obj2yaml %t.entsize | FileCheck %s --check-prefix=ENTSIZE + +# ENTSIZE: - Name: .relr.dyn +# ENTSIZE-NEXT: Type: SHT_RELR +# ENTSIZE-NEXT: EntSize: 0x00000000000000FE +# ENTSIZE-NEXT: Content: '1122334455667788' +# ENTSIZE-NEXT: ... 
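+## (With an unrecognized entry size the dumper cannot safely split the data
+## into Entries, so it falls back to raw Content and prints the EntSize
+## explicitly, as checked above.)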
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2MSB
+  Type: ET_DYN
+  Machine: EM_X86_64
+Sections:
+  - Name: .relr.dyn
+    Type: SHT_RELR
+    EntSize: 0xFE
+    Content: "1122334455667788"
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index df524ad0a2e8f2..51b3470afee453 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -479,7 +479,7 @@ static Error buildDuplicateError(const std::pair<uint64_t, CompileUnitIdentifiers> &PrevE,
                                  const CompileUnitIdentifiers &ID,
                                  StringRef DWPName) {
   return make_error<DWPError>(
-      std::string("Duplicate DWO ID (") + utohexstr(PrevE.first) + ") in " +
+      std::string("duplicate DWO ID (") + utohexstr(PrevE.first) + ") in " +
       buildDWODescription(PrevE.second.Name, PrevE.second.DWPName,
                           PrevE.second.DWOName) +
       " and " + buildDWODescription(ID.Name, DWPName, ID.DWOName));
@@ -596,7 +596,7 @@ static Error write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
       DWARFUnitIndex CUIndex(DW_SECT_INFO);
       DataExtractor CUIndexData(CurCUIndexSection, Obj.isLittleEndian(), 0);
       if (!CUIndex.parse(CUIndexData))
-        return make_error<DWPError>("Failed to parse cu_index");
+        return make_error<DWPError>("failed to parse cu_index");
       for (const DWARFUnitIndex::Entry &E : CUIndex.getRows()) {
         auto *I = E.getOffsets();
@@ -631,7 +631,7 @@ static Error write(MCStreamer &Out, ArrayRef<std::string> Inputs) {
       DWARFUnitIndex TUIndex(DW_SECT_TYPES);
       DataExtractor TUIndexData(CurTUIndexSection, Obj.isLittleEndian(), 0);
       if (!TUIndex.parse(TUIndexData))
-        return make_error<DWPError>("Failed to parse tu_index");
+        return make_error<DWPError>("failed to parse tu_index");
       addAllTypesFromDWP(Out, TypeIndexEntries, TUIndex, TypesSection,
                          CurTypesSection.front(), CurEntry,
                          ContributionOffsets[DW_SECT_TYPES - DW_SECT_INFO]);
diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp
index 180457bb6d91eb..b7fb99fa3431ea 100644
--- a/llvm/tools/obj2yaml/elf2yaml.cpp
+++ b/llvm/tools/obj2yaml/elf2yaml.cpp
@@ -521,6 +521,22 @@ Error ELFDumper<ELFT>::dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab,
   return Error::success();
 }
+template <class ELFT>
+static unsigned getDefaultShEntSize(ELFYAML::ELF_SHT SecType) {
+  switch (SecType) {
+  case ELF::SHT_REL:
+    return sizeof(typename ELFT::Rel);
+  case ELF::SHT_RELA:
+    return sizeof(typename ELFT::Rela);
+  case ELF::SHT_RELR:
+    return sizeof(typename ELFT::Relr);
+  case ELF::SHT_DYNAMIC:
+    return sizeof(typename ELFT::Dyn);
+  default:
+    return 0;
+  }
+}
+
 template <class ELFT>
 Error ELFDumper<ELFT>::dumpCommonSection(const Elf_Shdr *Shdr,
                                          ELFYAML::Section &S) {
@@ -532,7 +548,8 @@ Error ELFDumper<ELFT>::dumpCommonSection(const Elf_Shdr *Shdr,
   if (Shdr->sh_addr)
     S.Address = static_cast<uint64_t>(Shdr->sh_addr);
   S.AddressAlign = Shdr->sh_addralign;
-  if (Shdr->sh_entsize)
+
+  if (Shdr->sh_entsize != getDefaultShEntSize<ELFT>(S.Type))
     S.EntSize = static_cast<llvm::yaml::Hex64>(Shdr->sh_entsize);
   auto NameOrErr = getUniquedSectionName(Shdr);
@@ -575,6 +592,11 @@ Error ELFDumper<ELFT>::dumpCommonRelocationSection(
   if (Error E = dumpCommonSection(Shdr, S))
     return E;
+  // Having a zero sh_info field is normal: .rela.dyn is a dynamic
+  // relocation section that normally has no value in this field.
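+  // For static relocation sections sh_info holds the index of the section
+  // the relocations apply to; an index of 0 would name the null section,
+  // so the Info field is simply omitted from the output.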
+  if (!Shdr->sh_info)
+    return Error::success();
+
   auto InfoSection = Obj.getSection(Shdr->sh_info);
   if (!InfoSection)
     return InfoSection.takeError();
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
index 9dba01db1d11d1..05df08d46b29bb 100644
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_unittest(IRTests
   ValueTest.cpp
   VectorTypesTest.cpp
   VerifierTest.cpp
+  VPIntrinsicTest.cpp
   WaymarkTest.cpp
   )
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
new file mode 100644
index 00000000000000..919bac4ef266d7
--- /dev/null
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -0,0 +1,151 @@
+//===- VPIntrinsicTest.cpp - VPIntrinsic unit tests ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class VPIntrinsicTest : public testing::Test {
+protected:
+  LLVMContext Context;
+
+  VPIntrinsicTest() : Context() {}
+
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> CreateVPDeclarationModule() {
+    return parseAssemblyString(
+" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ",
+        Err, C);
+  }
+};
+
+/// Check that VPIntrinsic::canIgnoreVectorLengthParam() returns true
+/// if the vector length parameter does not mask off any lanes.
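+/// For example, a vector length of 256 on a <256 x i64> operation, or of
+/// vscale * 2 on a <vscale x 2 x i64> operation, covers every lane, while
+/// values such as 0, 7 or 123 leave lanes disabled.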
+TEST_F(VPIntrinsicTest, CanIgnoreVectorLength) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M =
+      parseAssemblyString(
+"declare <256 x i64> @llvm.vp.mul.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)"
+"declare <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)"
+"declare i32 @llvm.vscale.i32()"
+"define void @test_static_vlen( "
+"  <256 x i64> %i0, <vscale x 2 x i64> %si0,"
+"  <256 x i64> %i1, <vscale x 2 x i64> %si1,"
+"  <256 x i1> %m, <vscale x 2 x i1> %sm, i32 %vl) { "
+"  %r0 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %vl)"
+"  %r1 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 256)"
+"  %r2 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 0)"
+"  %r3 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 7)"
+"  %r4 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 123)"
+"  %vs = call i32 @llvm.vscale.i32()"
+"  %vs.i64 = mul i32 %vs, 2"
+"  %r5 = call <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64> %si0, <vscale x 2 x i64> %si1, <vscale x 2 x i1> %sm, i32 %vs.i64)"
+"  %r6 = call <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64> %si0, <vscale x 2 x i64> %si1, <vscale x 2 x i1> %sm, i32 99999)"
+"  ret void "
+"}",
+          Err, C);
+
+  auto *F = M->getFunction("test_static_vlen");
+  assert(F);
+
+  const int NumExpected = 7;
+  const bool Expected[] = {false, true, false, false, false, true, false};
+  int i = 0;
+  for (auto &I : F->getEntryBlock()) {
+    VPIntrinsic *VPI = dyn_cast<VPIntrinsic>(&I);
+    if (!VPI)
+      continue;
+
+    ASSERT_LT(i, NumExpected);
+    ASSERT_EQ(Expected[i], VPI->canIgnoreVectorLengthParam());
+    ++i;
+  }
+}
+
+/// Check that the argument returned by
+/// VPIntrinsic::GetParamPos(Intrinsic::ID) has the expected type.
+TEST_F(VPIntrinsicTest, GetParamPos) {
+  std::unique_ptr<Module> M = CreateVPDeclarationModule();
+  assert(M);
+
+  for (Function &F : *M) {
+    ASSERT_TRUE(F.isIntrinsic());
+    Optional<int> MaskParamPos =
+        VPIntrinsic::GetMaskParamPos(F.getIntrinsicID());
+    if (MaskParamPos.hasValue()) {
+      Type *MaskParamType = F.getArg(MaskParamPos.getValue())->getType();
+      ASSERT_TRUE(MaskParamType->isVectorTy());
+      ASSERT_TRUE(MaskParamType->getVectorElementType()->isIntegerTy(1));
+    }
+
+    Optional<int> VecLenParamPos =
+        VPIntrinsic::GetVectorLengthParamPos(F.getIntrinsicID());
+    if (VecLenParamPos.hasValue()) {
+      Type *VecLenParamType = F.getArg(VecLenParamPos.getValue())->getType();
+      ASSERT_TRUE(VecLenParamType->isIntegerTy(32));
+    }
+  }
+}
+
+/// Check that going from Opcode to VP intrinsic and back results in the same
+/// Opcode.
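+/// Opcodes with no VP counterpart are expected to map to
+/// Intrinsic::not_intrinsic, and VP intrinsics with no functional opcode
+/// report Instruction::Call; both cases are skipped below.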
+TEST_F(VPIntrinsicTest, OpcodeRoundTrip) {
+  std::vector<unsigned> Opcodes;
+  Opcodes.reserve(100);
+
+  {
+#define HANDLE_INST(OCNum, OCName, Class) Opcodes.push_back(OCNum);
+#include "llvm/IR/Instruction.def"
+  }
+
+  unsigned FullTripCounts = 0;
+  for (unsigned OC : Opcodes) {
+    Intrinsic::ID VPID = VPIntrinsic::GetForOpcode(OC);
+    // no equivalent VP intrinsic available
+    if (VPID == Intrinsic::not_intrinsic)
+      continue;
+
+    unsigned RoundTripOC = VPIntrinsic::GetFunctionalOpcodeForVP(VPID);
+    // no equivalent Opcode available
+    if (RoundTripOC == Instruction::Call)
+      continue;
+
+    ASSERT_EQ(RoundTripOC, OC);
+    ++FullTripCounts;
+  }
+  ASSERT_NE(FullTripCounts, 0u);
+}
+
+} // end anonymous namespace
diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 601223b11ab4b7..affba601a8bc76 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -28,6 +28,7 @@
 #ifdef _WIN32
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Chrono.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
 #include <windows.h>
 #include <winerror.h>
 #endif
@@ -1875,4 +1876,74 @@ TEST_F(FileSystemTest, permissions) {
 #endif
 }
+#ifdef _WIN32
+TEST_F(FileSystemTest, widenPath) {
+  const std::wstring LongPathPrefix(L"\\\\?\\");
+
+  // Test that the length limit is checked against the UTF-16 length and not the
+  // UTF-8 length.
+  std::string Input("C:\\foldername\\");
+  const std::string Pi("\xcf\x80"); // UTF-8 lower case pi.
+  // Add Pi up to the MAX_PATH limit.
+  const size_t NumChars = MAX_PATH - Input.size() - 1;
+  for (size_t i = 0; i < NumChars; ++i)
+    Input += Pi;
+  // Check that UTF-8 length already exceeds MAX_PATH.
+  EXPECT_TRUE(Input.size() > MAX_PATH);
+  SmallVector<wchar_t, MAX_PATH> Result;
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  // Result should not start with the long path prefix.
+  EXPECT_TRUE(std::wmemcmp(Result.data(), LongPathPrefix.c_str(),
+                           LongPathPrefix.size()) != 0);
+  EXPECT_EQ(Result.size(), MAX_PATH - 1);
+
+  // Add another Pi to exceed the MAX_PATH limit.
+  Input += Pi;
+  // Construct the expected result.
+  SmallVector<wchar_t, MAX_PATH> Expected;
+  ASSERT_NO_ERROR(windows::UTF8ToUTF16(Input, Expected));
+  Expected.insert(Expected.begin(), LongPathPrefix.begin(),
+                  LongPathPrefix.end());
+
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  EXPECT_EQ(Result, Expected);
+
+  // Test that UNC paths are handled correctly.
+  const std::string ShareName("\\\\sharename\\");
+  const std::string FileName("\\filename");
+  // Initialize directory name so that the input is within the MAX_PATH limit.
+  const char DirChar = 'x';
+  std::string DirName(MAX_PATH - ShareName.size() - FileName.size() - 1,
+                      DirChar);
+
+  Input = ShareName + DirName + FileName;
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  // Result should not start with the long path prefix.
+  EXPECT_TRUE(std::wmemcmp(Result.data(), LongPathPrefix.c_str(),
+                           LongPathPrefix.size()) != 0);
+  EXPECT_EQ(Result.size(), MAX_PATH - 1);
+
+  // Extend the directory name so the input exceeds the MAX_PATH limit.
+  DirName += DirChar;
+  Input = ShareName + DirName + FileName;
+  // Construct the expected result.
+  ASSERT_NO_ERROR(windows::UTF8ToUTF16(StringRef(Input).substr(2), Expected));
+  const std::wstring UNCPrefix(LongPathPrefix + L"UNC\\");
+  Expected.insert(Expected.begin(), UNCPrefix.begin(), UNCPrefix.end());
+
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  EXPECT_EQ(Result, Expected);
+
+  // Check that Unix separators are handled correctly.
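+  // (widenPath is expected to normalize the separators before prepending
+  // the long-path prefix, so the expected UTF-16 string stays the same.)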
+  std::replace(Input.begin(), Input.end(), '\\', '/');
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  EXPECT_EQ(Result, Expected);
+
+  // Check the removal of "dots".
+  Input = ShareName + DirName + "\\.\\foo\\.\\.." + FileName;
+  ASSERT_NO_ERROR(windows::widenPath(Input, Result));
+  EXPECT_EQ(Result, Expected);
+}
+#endif
+
 } // anonymous namespace
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index 3a25620f744a6f..5269a9a17543fd 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -10,6 +10,7 @@
 #include "../lib/Transforms/Vectorize/VPlanTransforms.h"
 #include "VPlanTestBase.h"
 #include "gtest/gtest.h"
+#include <string>
 namespace llvm {
 namespace {
@@ -88,6 +89,45 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) {
   EXPECT_EQ(IndvarAdd, ICmp->getOperand(0));
   EXPECT_EQ(VecBB->getCondBit(), ICmp);
+  // Add an external value to check we do not print the list of external values,
+  // as this is not required with the new printing.
+  Plan->addVPValue(&*F->arg_begin());
+  std::string FullDump;
+  raw_string_ostream(FullDump) << *Plan;
+  EXPECT_EQ(R"(digraph VPlan {
+graph [labelloc=t, fontsize=30; label="Vectorization Plan"]
+node [shape=rect, fontname=Courier, fontsize=30]
+edge [fontname=Courier, fontsize=30]
+compound=true
+  subgraph cluster_N0 {
+    fontname=Courier
+    label="\ TopRegion"
+    N1 [label =
+      "entry:\n"
+    ]
+    N1 -> N2 [ label=""]
+    N2 [label =
+      "for.body:\n" +
+      "EMIT ir<%indvars.iv> = phi ir<0> ir<%indvars.iv.next>\l" +
+      "EMIT ir<%arr.idx> = getelementptr ir<%A> ir<%indvars.iv>\l" +
+      "EMIT ir<%l1> = load ir<%arr.idx>\l" +
+      "EMIT ir<%res> = add ir<%l1> ir<10>\l" +
+      "EMIT store ir<%res> ir<%arr.idx>\l" +
+      "EMIT ir<%indvars.iv.next> = add ir<%indvars.iv> ir<1>\l" +
+      "EMIT ir<%exitcond> = icmp ir<%indvars.iv.next> ir<%N>\l" +
+      "CondBit: ir<%exitcond> (for.body)\l"
+    ]
+    N2 -> N2 [ label="T"]
+    N2 -> N3 [ label="F"]
+    N3 [label =
+      "for.end:\n" +
+      "EMIT ret\l"
+    ]
+  }
+}
+)",
+            FullDump);
+
   LoopVectorizationLegality::InductionList Inductions;
   SmallPtrSet<Instruction *, 1> DeadInstructions;
   VPlanTransforms::VPInstructionsToVPRecipes(LI->getLoopFor(LoopHeader), Plan,
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 61df1f6288ceba..ce0e0017fa85e3 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -178,17 +178,17 @@ TEST(VPBasicBlockTest, getPlan) {
 }
 TEST(VPBasicBlockTest, print) {
-  VPInstruction *I1 = new VPInstruction(10, {});
-  VPInstruction *I2 = new VPInstruction(1, {I1});
-  VPInstruction *I3 = new VPInstruction(2, {I1, I2});
+  VPInstruction *I1 = new VPInstruction(Instruction::Add, {});
+  VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1});
+  VPInstruction *I3 = new VPInstruction(Instruction::Br, {I1, I2});
   VPBasicBlock *VPBB1 = new VPBasicBlock();
   VPBB1->appendRecipe(I1);
   VPBB1->appendRecipe(I2);
   VPBB1->appendRecipe(I3);
-  VPInstruction *I4 = new VPInstruction(4, {I3, I2});
-  VPInstruction *I5 = new VPInstruction(5, {I1});
+  VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1});
+  VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4});
   VPBasicBlock *VPBB2 = new VPBasicBlock();
   VPBB2->appendRecipe(I4);
   VPBB2->appendRecipe(I5);
@@ -201,7 +201,7 @@ TEST(VPBasicBlockTest, print) {
     raw_string_ostream OS(I3Dump);
     I3->print(OS);
     OS.flush();
-    EXPECT_EQ(" = br ", I3Dump);
+
EXPECT_EQ("br ", I3Dump); } VPlan Plan; @@ -216,15 +216,15 @@ edge [fontname=Courier, fontsize=30] compound=true N0 [label = ":\n" + - "EMIT %vp0 = catchswitch\l" + - "EMIT %vp1 = ret %vp0\l" + - "EMIT %vp2 = br %vp0 %vp1\l" + "EMIT vp<%0> = add\l" + + "EMIT vp<%1> = sub vp<%0>\l" + + "EMIT br vp<%0> vp<%1>\l" ] N0 -> N1 [ label=""] N1 [label = ":\n" + - "EMIT %vp3 = indirectbr %vp2 %vp1\l" + - "EMIT %vp4 = invoke %vp0\l" + "EMIT vp<%2> = mul vp<%1> vp<%0>\l" + + "EMIT ret vp<%2>\l" ] } )", @@ -235,15 +235,15 @@ compound=true raw_string_ostream OS(I3Dump); I3->print(OS); OS.flush(); - EXPECT_EQ("%vp2 = br %vp0 %vp1", I3Dump); + EXPECT_EQ("br vp<%0> vp<%1>", I3Dump); } { - std::string I2Dump; - raw_string_ostream OS(I2Dump); - OS << *I2; + std::string I4Dump; + raw_string_ostream OS(I4Dump); + OS << *I4; OS.flush(); - EXPECT_EQ("%vp1 = ret %vp0", I2Dump); + EXPECT_EQ("vp<%2> = mul vp<%1> vp<%0>", I4Dump); } } diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index 723bbe0cc23d8b..824bb944753bf6 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -123,6 +123,9 @@ struct CodeGenIntrinsic { /// True if the intrinsic is no-return. bool isNoReturn; + /// True if the intrinsic is no-sync. + bool isNoSync; + /// True if the intrinsic is will-return. bool isWillReturn; diff --git a/llvm/utils/TableGen/CodeGenRegisters.cpp b/llvm/utils/TableGen/CodeGenRegisters.cpp index bebee0d685d737..4584bc7cfae324 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/CodeGenRegisters.cpp @@ -1275,8 +1275,8 @@ CodeGenRegBank::getOrCreateSubClass(const CodeGenRegisterClass *RC, return &RegClasses.back(); } -CodeGenRegisterClass *CodeGenRegBank::getRegClass(Record *Def) { - if (CodeGenRegisterClass *RC = Def2RC[Def]) +CodeGenRegisterClass *CodeGenRegBank::getRegClass(const Record *Def) const { + if (CodeGenRegisterClass *RC = Def2RC.lookup(Def)) return RC; PrintFatalError(Def->getLoc(), "Not a known RegisterClass!"); diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index f15138b04b0129..2b200adef31249 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -719,7 +719,7 @@ namespace llvm { } // Find a register class from its def. - CodeGenRegisterClass *getRegClass(Record*); + CodeGenRegisterClass *getRegClass(const Record *) const; /// getRegisterClassForRegister - Find the register class that contains the /// specified physical register. If the register is not in a register diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index de41692c6f45b2..921d20e7af7659 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -607,6 +607,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { isCommutative = false; canThrow = false; isNoReturn = false; + isNoSync = false; isWillReturn = false; isCold = false; isNoDuplicate = false; @@ -726,8 +727,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { // variants with iAny types; otherwise, if the intrinsic is not // overloaded, all the types can be specified directly. 
assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
-              !TyEl->isSubClassOf("LLVMTruncatedType") &&
-              !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) ||
+              !TyEl->isSubClassOf("LLVMTruncatedType")) ||
             VT == MVT::iAny || VT == MVT::vAny) &&
         "Expected iAny or vAny type");
    } else
@@ -772,6 +772,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
      isConvergent = true;
    else if (Property->getName() == "IntrNoReturn")
      isNoReturn = true;
+    else if (Property->getName() == "IntrNoSync")
+      isNoSync = true;
    else if (Property->getName() == "IntrWillReturn")
      isWillReturn = true;
    else if (Property->getName() == "IntrCold")
diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index 3ac9cc857f02a3..b0ac385c19390c 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -579,6 +579,9 @@ struct AttributeComparator {
    if (L->isNoReturn != R->isNoReturn)
      return R->isNoReturn;
+    if (L->isNoSync != R->isNoSync)
+      return R->isNoSync;
+
    if (L->isWillReturn != R->isWillReturn)
      return R->isWillReturn;
@@ -720,8 +723,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
    if (!intrinsic.canThrow ||
        (intrinsic.ModRef != CodeGenIntrinsic::ReadWriteMem &&
         !intrinsic.hasSideEffects) ||
-        intrinsic.isNoReturn || intrinsic.isWillReturn || intrinsic.isCold ||
-        intrinsic.isNoDuplicate || intrinsic.isConvergent ||
+        intrinsic.isNoReturn || intrinsic.isNoSync || intrinsic.isWillReturn ||
+        intrinsic.isCold || intrinsic.isNoDuplicate || intrinsic.isConvergent ||
        intrinsic.isSpeculatable) {
      OS << "      const Attribute::AttrKind Atts[] = {";
      bool addComma = false;
@@ -735,6 +738,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
        OS << "Attribute::NoReturn";
        addComma = true;
      }
+      if (intrinsic.isNoSync) {
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::NoSync";
+        addComma = true;
+      }
      if (intrinsic.isWillReturn) {
        if (addComma)
          OS << ",";
diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp
index 7f6b3931d3dea8..5d0751d144516b 100644
--- a/llvm/utils/TableGen/RegisterBankEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp
@@ -19,6 +19,7 @@
 #include "CodeGenHwModes.h"
 #include "CodeGenRegisters.h"
+#include "CodeGenTarget.h"
#define DEBUG_TYPE "register-bank-emitter"
@@ -36,12 +37,12 @@ class RegisterBank {
  /// The register classes that are covered by the register bank.
  RegisterClassesTy RCs;
-  /// The register class with the largest register size.
-  const CodeGenRegisterClass *RCWithLargestRegsSize;
+  /// The register classes with the largest register size for each HwMode.
+  std::vector<const CodeGenRegisterClass *> RCsWithLargestRegSize;
 public:
-  RegisterBank(const Record &TheDef)
-      : TheDef(TheDef), RCs(), RCWithLargestRegsSize(nullptr) {}
+  RegisterBank(const Record &TheDef, unsigned NumModeIds)
+      : TheDef(TheDef), RCs(), RCsWithLargestRegSize(NumModeIds) {}
  /// Get the human-readable name for the bank.
  StringRef getName() const { return TheDef.getValueAsString("Name"); }
@@ -53,6 +54,10 @@ class RegisterBank {
    return (TheDef.getName() + "CoverageData").str();
  }
+  std::string getSizesArrayName() const {
+    return (TheDef.getName() + "Sizes").str();
+  }
+
  /// Get the name of the global instance variable.
  StringRef getInstanceVarName() const { return TheDef.getName(); }
@@ -60,10 +65,10 @@ class RegisterBank {
  /// Get the register classes listed in the RegisterBank.RegisterClasses field.
std::vector<const CodeGenRegisterClass *>
-  getExplictlySpecifiedRegisterClasses(
-      CodeGenRegBank &RegisterClassHierarchy) const {
+  getExplicitlySpecifiedRegisterClasses(
+      const CodeGenRegBank &RegisterClassHierarchy) const {
    std::vector<const CodeGenRegisterClass *> RCs;
-    for (const auto &RCDef : getDef().getValueAsListOfDefs("RegisterClasses"))
+    for (const auto *RCDef : getDef().getValueAsListOfDefs("RegisterClasses"))
      RCs.push_back(RegisterClassHierarchy.getRegClass(RCDef));
    return RCs;
  }
@@ -82,18 +87,20 @@ class RegisterBank {
    // register size anywhere (we could sum the sizes of the subregisters
    // but there may be additional bits too) and we can't derive it from
    // the VT's reliably due to Untyped.
-    if (RCWithLargestRegsSize == nullptr)
-      RCWithLargestRegsSize = RC;
-    else if (RCWithLargestRegsSize->RSI.get(DefaultMode).SpillSize <
-             RC->RSI.get(DefaultMode).SpillSize)
-      RCWithLargestRegsSize = RC;
-    assert(RCWithLargestRegsSize && "RC was nullptr?");
-
+    unsigned NumModeIds = RCsWithLargestRegSize.size();
+    for (unsigned M = 0; M < NumModeIds; ++M) {
+      if (RCsWithLargestRegSize[M] == nullptr)
+        RCsWithLargestRegSize[M] = RC;
+      else if (RCsWithLargestRegSize[M]->RSI.get(M).SpillSize <
+               RC->RSI.get(M).SpillSize)
+        RCsWithLargestRegSize[M] = RC;
+      assert(RCsWithLargestRegSize[M] && "RC was nullptr?");
+    }
    RCs.emplace_back(RC);
  }
-  const CodeGenRegisterClass *getRCWithLargestRegsSize() const {
-    return RCWithLargestRegsSize;
+  const CodeGenRegisterClass *getRCWithLargestRegsSize(unsigned HwMode) const {
+    return RCsWithLargestRegSize[HwMode];
  }
  iterator_range
@@ -104,8 +111,8 @@ class RegisterBank {
class RegisterBankEmitter {
 private:
+  CodeGenTarget Target;
  RecordKeeper &Records;
-  CodeGenRegBank RegisterClassHierarchy;
  void emitHeader(raw_ostream &OS, const StringRef TargetName,
                  const std::vector<RegisterBank> &Banks);
@@ -115,8 +122,7 @@ class RegisterBankEmitter {
                std::vector<RegisterBank> &Banks);
 public:
-  RegisterBankEmitter(RecordKeeper &R)
-      : Records(R), RegisterClassHierarchy(Records, CodeGenHwModes(R)) {}
+  RegisterBankEmitter(RecordKeeper &R) : Target(R), Records(R) {}
  void run(raw_ostream &OS);
};
@@ -147,7 +153,7 @@ void RegisterBankEmitter::emitBaseClassDefinition(
  OS << "private:\n"
     << "  static RegisterBank *RegBanks[];\n\n"
     << "protected:\n"
-     << "  " << TargetName << "GenRegisterBankInfo();\n"
+     << "  " << TargetName << "GenRegisterBankInfo(unsigned HwMode = 0);\n"
     << "\n";
}
@@ -167,8 +173,8 @@ void RegisterBankEmitter::emitBaseClassDefinition(
/// multiple times for a given class if there are multiple paths
/// to the class.
static void visitRegisterBankClasses( - CodeGenRegBank &RegisterClassHierarchy, const CodeGenRegisterClass *RC, - const Twine Kind, + const CodeGenRegBank &RegisterClassHierarchy, + const CodeGenRegisterClass *RC, const Twine Kind, std::function VisitFn, SmallPtrSetImpl &VisitedRCs) { @@ -212,6 +218,8 @@ static void visitRegisterBankClasses( void RegisterBankEmitter::emitBaseClassImplementation( raw_ostream &OS, StringRef TargetName, std::vector &Banks) { + const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank(); + const CodeGenHwModes &CGH = Target.getHwModes(); OS << "namespace llvm {\n" << "namespace " << TargetName << " {\n"; @@ -239,14 +247,30 @@ void RegisterBankEmitter::emitBaseClassImplementation( } OS << "\n"; + unsigned NumModeIds = CGH.getNumModeIds(); + for (const auto &Bank : Banks) { + OS << "const unsigned " << Bank.getSizesArrayName() << "[] = {\n"; + for (unsigned M = 0; M < NumModeIds; ++M) { + const CodeGenRegisterClass &RC = *Bank.getRCWithLargestRegsSize(M); + unsigned Size = RC.RSI.get(M).SpillSize; + OS << " // Mode = " << M << " ("; + if (M == 0) + OS << "Default"; + else + OS << CGH.getMode(M).Name; + OS << ")\n"; + OS << " " << Size << ",\n"; + } + OS << "};\n"; + } + OS << "\n"; + for (const auto &Bank : Banks) { std::string QualifiedBankID = (TargetName + "::" + Bank.getEnumeratorName()).str(); - const CodeGenRegisterClass &RC = *Bank.getRCWithLargestRegsSize(); - unsigned Size = RC.RSI.get(DefaultMode).SpillSize; OS << "RegisterBank " << Bank.getInstanceVarName() << "(/* ID */ " << QualifiedBankID << ", /* Name */ \"" << Bank.getName() - << "\", /* Size */ " << Size << ", " + << "\", /* Sizes */ " << Bank.getInstanceVarName() << "Sizes, " << "/* CoveredRegClasses */ " << Bank.getCoverageArrayName() << ", /* NumRegClasses */ " << RegisterClassHierarchy.getRegClasses().size() << ");\n"; @@ -261,9 +285,9 @@ void RegisterBankEmitter::emitBaseClassImplementation( OS << "};\n\n"; OS << TargetName << "GenRegisterBankInfo::" << TargetName - << "GenRegisterBankInfo()\n" + << "GenRegisterBankInfo(unsigned HwMode)\n" << " : RegisterBankInfo(RegBanks, " << TargetName - << "::NumRegisterBanks) {\n" + << "::NumRegisterBanks, HwMode) {\n" << " // Assert that RegBank indices match their ID's\n" << "#ifndef NDEBUG\n" << " unsigned Index = 0;\n" @@ -275,18 +299,17 @@ void RegisterBankEmitter::emitBaseClassImplementation( } void RegisterBankEmitter::run(raw_ostream &OS) { - std::vector Targets = Records.getAllDerivedDefinitions("Target"); - if (Targets.size() != 1) - PrintFatalError("ERROR: Too many or too few subclasses of Target defined!"); - StringRef TargetName = Targets[0]->getName(); + StringRef TargetName = Target.getName(); + const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank(); + const CodeGenHwModes &CGH = Target.getHwModes(); std::vector Banks; for (const auto &V : Records.getAllDerivedDefinitions("RegisterBank")) { SmallPtrSet VisitedRCs; - RegisterBank Bank(*V); + RegisterBank Bank(*V, CGH.getNumModeIds()); for (const CodeGenRegisterClass *RC : - Bank.getExplictlySpecifiedRegisterClasses(RegisterClassHierarchy)) { + Bank.getExplicitlySpecifiedRegisterClasses(RegisterClassHierarchy)) { visitRegisterBankClasses( RegisterClassHierarchy, RC, "explicit", [&Bank](const CodeGenRegisterClass *RC, StringRef Kind) { @@ -301,14 +324,14 @@ void RegisterBankEmitter::run(raw_ostream &OS) { } // Warn about ambiguous MIR caused by register bank/class name clashes. 
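For concreteness, the generated code under the new scheme gains one size table per bank, indexed by HwMode. A hedged sketch of plausible emitter output for a hypothetical target with two modes (MyTarget, GPRRegBank, and Mode64 are all invented names; the sizes are arbitrary):

```cpp
// Illustrative excerpt of generated output only; not copied from a real
// target. One spill size per HwMode, selected via the HwMode ctor argument.
const unsigned GPRRegBankSizes[] = {
    // Mode = 0 (Default)
    32,
    // Mode = 1 (Mode64)
    64,
};

RegisterBank GPRRegBank(/* ID */ MyTarget::GPRRegBankID, /* Name */ "GPRB",
                        /* Sizes */ GPRRegBankSizes,
                        /* CoveredRegClasses */ GPRRegBankCoverageData,
                        /* NumRegClasses */ 42);
```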
- for (const auto &Class : Records.getAllDerivedDefinitions("RegisterClass")) { + for (const auto &Class : RegisterClassHierarchy.getRegClasses()) { for (const auto &Bank : Banks) { - if (Bank.getName().lower() == Class->getName().lower()) { + if (Bank.getName().lower() == StringRef(Class.getName()).lower()) { PrintWarning(Bank.getDef().getLoc(), "Register bank names should be " "distinct from register classes " "to avoid ambiguous MIR"); PrintNote(Bank.getDef().getLoc(), "RegisterBank was declared here"); - PrintNote(Class->getLoc(), "RegisterClass was declared here"); + PrintNote(Class.getDef()->getLoc(), "RegisterClass was declared here"); } } } diff --git a/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn index 34297ecf3b142c..a0e35cc11ef6c0 100644 --- a/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn @@ -3,11 +3,6 @@ executable("clang-tblgen") { "//llvm/lib/Support", "//llvm/lib/TableGen", ] - - # FIXME: This is incorrect, see https://reviews.llvm.org/D75470#inline-695187 - # Remoe again once that's rectified. - include_dirs = [ "//clang/include" ] - sources = [ "ASTTableGen.cpp", "ClangASTNodesEmitter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn index 4e3e776790c20e..92b3f4e37810e3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn @@ -39,6 +39,7 @@ unittest("IRTests") { "TypesTest.cpp", "UseTest.cpp", "UserTest.cpp", + "VPIntrinsicTest.cpp", "ValueHandleTest.cpp", "ValueMapTest.cpp", "ValueTest.cpp", diff --git a/llvm/utils/lit/lit/Test.py b/llvm/utils/lit/lit/Test.py index 62f1bbf1f03a5c..000bcf8fc38fe6 100644 --- a/llvm/utils/lit/lit/Test.py +++ b/llvm/utils/lit/lit/Test.py @@ -220,6 +220,10 @@ def __init__(self, suite, path_in_suite, config, file_path = None): # triple parts. All of them must be False for the test to run. self.unsupported = [] + # An optional number of retries allowed before the test finally succeeds. + # The test is run at most once plus the number of retries specified here. + self.allowed_retries = getattr(config, 'test_retry_attempts', 0) + # The test result, once complete. self.result = None diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 96411c98ee3349..a9518b2b5a0b02 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1182,13 +1182,15 @@ class ParserKind(object): LIST: A keyword taking a comma-separated list of values. BOOLEAN_EXPR: A keyword taking a comma-separated list of boolean expressions. Ex 'XFAIL:' + INTEGER: A keyword taking a single integer. Ex 'ALLOW_RETRIES:' CUSTOM: A keyword with custom parsing semantics. 
""" TAG = 0 COMMAND = 1 LIST = 2 BOOLEAN_EXPR = 3 - CUSTOM = 4 + INTEGER = 4 + CUSTOM = 5 @staticmethod def allowedKeywordSuffixes(value): @@ -1196,6 +1198,7 @@ def allowedKeywordSuffixes(value): ParserKind.COMMAND: [':'], ParserKind.LIST: [':'], ParserKind.BOOLEAN_EXPR: [':'], + ParserKind.INTEGER: [':'], ParserKind.CUSTOM: [':', '.'] } [value] @@ -1205,6 +1208,7 @@ def str(value): ParserKind.COMMAND: 'COMMAND', ParserKind.LIST: 'LIST', ParserKind.BOOLEAN_EXPR: 'BOOLEAN_EXPR', + ParserKind.INTEGER: 'INTEGER', ParserKind.CUSTOM: 'CUSTOM' } [value] @@ -1247,6 +1251,8 @@ def __init__(self, keyword, kind, parser=None, initial_value=None): self.parser = self._handleList elif kind == ParserKind.BOOLEAN_EXPR: self.parser = self._handleBooleanExpr + elif kind == ParserKind.INTEGER: + self.parser = self._handleSingleInteger elif kind == ParserKind.TAG: self.parser = self._handleTag elif kind == ParserKind.CUSTOM: @@ -1311,6 +1317,18 @@ def _handleList(line_number, line, output): output.extend([s.strip() for s in line.split(',')]) return output + @staticmethod + def _handleSingleInteger(line_number, line, output): + """A parser for INTEGER type keywords""" + if output is None: + output = [] + try: + n = int(line) + except ValueError: + raise ValueError("INTEGER parser requires the input to be an integer (got {})".format(line)) + output.append(n) + return output + @staticmethod def _handleBooleanExpr(line_number, line, output): """A parser for BOOLEAN_EXPR type keywords""" @@ -1331,8 +1349,8 @@ def _handleBooleanExpr(line_number, line, output): def parseIntegratedTestScript(test, additional_parsers=[], require_script=True): """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test - script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES' - and 'UNSUPPORTED' information. + script and extract the lines to 'RUN' as well as 'XFAIL', 'REQUIRES', + 'UNSUPPORTED' and 'ALLOW_RETRIES' information. If additional parsers are specified then the test is also scanned for the keywords they specify and all matches are passed to the custom parser. @@ -1353,6 +1371,7 @@ def parseIntegratedTestScript(test, additional_parsers=[], initial_value=test.requires), IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.BOOLEAN_EXPR, initial_value=test.unsupported), + IntegratedTestKeywordParser('ALLOW_RETRIES:', ParserKind.INTEGER), IntegratedTestKeywordParser('END.', ParserKind.TAG) ] keyword_parsers = {p.keyword: p for p in builtin_parsers} @@ -1412,6 +1431,14 @@ def parseIntegratedTestScript(test, additional_parsers=[], "Test does not support the following features " "and/or targets: %s" % msg) + # Handle ALLOW_RETRIES: + allowed_retries = keyword_parsers['ALLOW_RETRIES:'].getValue() + if allowed_retries: + if len(allowed_retries) > 1: + return lit.Test.Result(Test.UNRESOLVED, + "Test has more than one ALLOW_RETRIES lines") + test.allowed_retries = allowed_retries[0] + # Enforce limit_to_features. if not test.isWithinFeatureLimits(): msg = ', '.join(test.config.limit_to_features) @@ -1477,10 +1504,8 @@ def executeShTest(test, litConfig, useExternalSh, normalize_slashes=useExternalSh) script = applySubstitutions(script, substitutions) - # Re-run failed tests up to test_retry_attempts times. - attempts = 1 - if hasattr(test.config, 'test_retry_attempts'): - attempts += test.config.test_retry_attempts + # Re-run failed tests up to test.allowed_retries times. 
+ attempts = test.allowed_retries + 1 for i in range(attempts): res = _runShTest(test, litConfig, useExternalSh, script, tmpBase) if res.code != Test.FAIL: diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py b/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py new file mode 100644 index 00000000000000..05e3f35b6f81e3 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py @@ -0,0 +1,3 @@ +# ALLOW_RETRIES: 3 + +# RUN: false diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg b/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg new file mode 100644 index 00000000000000..eed69f389ed07f --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg @@ -0,0 +1,9 @@ +import lit.formats +config.name = 'allow-retries' +config.suffixes = ['.py'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + +config.substitutions.append(('%python', lit_config.params.get('python', ''))) +config.substitutions.append(('%counter', lit_config.params.get('counter', ''))) diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py b/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py new file mode 100644 index 00000000000000..14fb6b26661a5e --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py @@ -0,0 +1,4 @@ +# ALLOW_RETRIES: 3 +# ALLOW_RETRIES: 5 + +# RUN: true diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py b/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py new file mode 100644 index 00000000000000..d624de900b7f07 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py @@ -0,0 +1,3 @@ +# ALLOW_RETRIES: not-an-integer + +# RUN: true diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py b/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py new file mode 100644 index 00000000000000..45ac9433fc7efd --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py @@ -0,0 +1,24 @@ +# ALLOW_RETRIES: 5 + +# RUN: "%python" "%s" "%counter" + +import sys +import os + +counter_file = sys.argv[1] + +# The first time the test is run, initialize the counter to 1. +if not os.path.exists(counter_file): + with open(counter_file, 'w') as counter: + counter.write("1") + +# Succeed if this is the fourth time we're being run. 
+with open(counter_file, 'r') as counter: + num = int(counter.read()) + if num == 4: + sys.exit(0) + +# Otherwise, increment the counter and fail +with open(counter_file, 'w') as counter: + counter.write(str(num + 1)) + sys.exit(1) diff --git a/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg new file mode 100644 index 00000000000000..a3b660fbaef327 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg @@ -0,0 +1,10 @@ +import lit.formats +config.name = 'test_retry_attempts' +config.suffixes = ['.py'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + +config.test_retry_attempts = 5 +config.substitutions.append(('%python', lit_config.params.get('python', ''))) +config.substitutions.append(('%counter', lit_config.params.get('counter', ''))) diff --git a/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py new file mode 100644 index 00000000000000..ee8a92cc5d8ff2 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py @@ -0,0 +1,22 @@ +# RUN: "%python" "%s" "%counter" + +import sys +import os + +counter_file = sys.argv[1] + +# The first time the test is run, initialize the counter to 1. +if not os.path.exists(counter_file): + with open(counter_file, 'w') as counter: + counter.write("1") + +# Succeed if this is the fourth time we're being run. +with open(counter_file, 'r') as counter: + num = int(counter.read()) + if num == 4: + sys.exit(0) + +# Otherwise, increment the counter and fail +with open(counter_file, 'w') as counter: + counter.write(str(num + 1)) + sys.exit(1) diff --git a/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt b/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt index e28060320a2e0d..5809af5477ceb4 100644 --- a/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt +++ b/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt @@ -13,6 +13,9 @@ // MY_BOOL: b) // MY_BOOL: d // +// MY_INT: 4 +// MY_INT: 6 +// // MY_BOOL_UNTERMINATED: a \ // // END. diff --git a/llvm/utils/lit/tests/allow-retries.py b/llvm/utils/lit/tests/allow-retries.py new file mode 100644 index 00000000000000..3f6cf8f1faa568 --- /dev/null +++ b/llvm/utils/lit/tests/allow-retries.py @@ -0,0 +1,41 @@ +# Check the behavior of the ALLOW_RETRIES keyword. + +# This test uses a file that's stable across retries of the test to fail and +# only succeed the fourth time it is retried. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/allow-retries/succeeds-within-limit.py -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST1 %s +# CHECK-TEST1: Passes With Retry : 1 + +# Test that a per-file ALLOW_RETRIES overrides the config-wide test_retry_attempts property, if any. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/allow-retries/succeeds-within-limit.py -Dtest_retry_attempts=2 -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST2 %s +# CHECK-TEST2: Passes With Retry : 1 + +# This test does not succeed within the allowed retry limit +# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/does-not-succeed-within-limit.py | FileCheck --check-prefix=CHECK-TEST3 %s +# CHECK-TEST3: Failing Tests (1): +# CHECK-TEST3: allow-retries :: does-not-succeed-within-limit.py + +# This test should be UNRESOLVED since it has more than one ALLOW_RETRIES +# line, and that is not allowed.
+# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/more-than-one-allow-retries-lines.py | FileCheck --check-prefix=CHECK-TEST4 %s +# CHECK-TEST4: Unresolved Tests (1): +# CHECK-TEST4: allow-retries :: more-than-one-allow-retries-lines.py + +# This test does not provide a valid integer to the ALLOW_RETRIES keyword. +# It should be unresolved. +# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/not-a-valid-integer.py | FileCheck --check-prefix=CHECK-TEST5 %s +# CHECK-TEST5: Unresolved Tests (1): +# CHECK-TEST5: allow-retries :: not-a-valid-integer.py + +# This test checks that the config-wide test_retry_attempts property is used +# when no ALLOW_RETRIES keyword is present. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/test_retry_attempts/test.py -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST6 %s +# CHECK-TEST6: Passes With Retry : 1 diff --git a/llvm/utils/lit/tests/unit/TestRunner.py b/llvm/utils/lit/tests/unit/TestRunner.py index ceb7bef34f6a8d..4f33fce648850a 100644 --- a/llvm/utils/lit/tests/unit/TestRunner.py +++ b/llvm/utils/lit/tests/unit/TestRunner.py @@ -57,6 +57,7 @@ def custom_parse(line_number, line, output): IntegratedTestKeywordParser("MY_DNE_TAG.", ParserKind.TAG), IntegratedTestKeywordParser("MY_LIST:", ParserKind.LIST), IntegratedTestKeywordParser("MY_BOOL:", ParserKind.BOOLEAN_EXPR), + IntegratedTestKeywordParser("MY_INT:", ParserKind.INTEGER), IntegratedTestKeywordParser("MY_RUN:", ParserKind.COMMAND), IntegratedTestKeywordParser("MY_CUSTOM:", ParserKind.CUSTOM, custom_parse), @@ -112,6 +113,17 @@ def test_boolean(self): self.assertEqual(value[0].strip(), "a && (b)") self.assertEqual(value[1].strip(), "d") + def test_integer(self): + parsers = self.make_parsers() + self.parse_test(parsers) + int_parser = self.get_parser(parsers, 'MY_INT:') + value = int_parser.getValue() + self.assertEqual(len(value), 2) # there are only two MY_INT: lines + self.assertEqual(type(value[0]), int) + self.assertEqual(value[0], 4) + self.assertEqual(type(value[1]), int) + self.assertEqual(value[1], 6) + def test_boolean_unterminated(self): parsers = self.make_parsers() + \ [IntegratedTestKeywordParser("MY_BOOL_UNTERMINATED:", ParserKind.BOOLEAN_EXPR)] diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index 487a37b4b86bac..0a661e82d86a97 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -203,6 +203,7 @@ syn match llvmConstant /\/ syn match llvmSpecialComment /;\s*PR\d*\s*$/ syn match llvmSpecialComment /;\s*REQUIRES:.*$/ syn match llvmSpecialComment /;\s*RUN:.*$/ +syn match llvmSpecialComment /;\s*ALLOW_RETRIES:.*$/ syn match llvmSpecialComment /;\s*CHECK:.*$/ syn match llvmSpecialComment "\v;\s*CHECK-(NEXT|NOT|DAG|SAME|LABEL):.*$" syn match llvmSpecialComment /;\s*XFAIL:.*$/ diff --git a/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml b/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml index 9765cee98df814..117ec134d5738e 100644 --- a/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml +++ b/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml @@ -319,6 +319,8 @@ patterns: name: string.regexp - match: ";\\s*RUN:.*$" name: string.regexp + - match: ";\\s*ALLOW_RETRIES:.*$" + name: string.regexp - match: ";\\s*CHECK:.*$" name: string.regexp - match: ";\\s*CHECK-(NEXT|NOT|DAG|SAME|LABEL):.*$" diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp index 51ff330cce677f..6a89d0006239f7 100644 --- a/mlir/examples/toy/Ch4/toyc.cpp +++ b/mlir/examples/toy/Ch4/toyc.cpp @@ -119,7 
+119,6 @@ int dumpMLIR() { // Inline all functions into main and then delete them. pm.addPass(mlir::createInlinerPass()); - pm.addPass(mlir::createSymbolDCEPass()); // Now that there is only one function, we can infer the shapes of each of // the operations. diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp index c9a52c606b213f..4bc2af2ff899ec 100644 --- a/mlir/examples/toy/Ch5/toyc.cpp +++ b/mlir/examples/toy/Ch5/toyc.cpp @@ -125,7 +125,6 @@ int dumpMLIR() { if (enableOpt || isLoweringToAffine) { // Inline all functions into main and then delete them. pm.addPass(mlir::createInlinerPass()); - pm.addPass(mlir::createSymbolDCEPass()); // Now that there is only one function, we can infer the shapes of each of // the operations. diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp index 3c54f731ff4297..558141c2ca8905 100644 --- a/mlir/examples/toy/Ch6/toyc.cpp +++ b/mlir/examples/toy/Ch6/toyc.cpp @@ -139,7 +139,6 @@ int loadAndProcessMLIR(mlir::MLIRContext &context, if (enableOpt || isLoweringToAffine) { // Inline all functions into main and then delete them. pm.addPass(mlir::createInlinerPass()); - pm.addPass(mlir::createSymbolDCEPass()); // Now that there is only one function, we can infer the shapes of each of // the operations. diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp index 1f5f988caca3c4..f3e12fb986cf0b 100644 --- a/mlir/examples/toy/Ch7/toyc.cpp +++ b/mlir/examples/toy/Ch7/toyc.cpp @@ -139,7 +139,6 @@ int loadAndProcessMLIR(mlir::MLIRContext &context, if (enableOpt || isLoweringToAffine) { // Inline all functions into main and then delete them. pm.addPass(mlir::createInlinerPass()); - pm.addPass(mlir::createSymbolDCEPass()); // Now that there is only one function, we can infer the shapes of each of // the operations. diff --git a/mlir/include/mlir/Analysis/CallGraph.h b/mlir/include/mlir/Analysis/CallGraph.h index cd25151da4c076..b4ef04969b5af6 100644 --- a/mlir/include/mlir/Analysis/CallGraph.h +++ b/mlir/include/mlir/Analysis/CallGraph.h @@ -192,6 +192,9 @@ class CallGraph { /// external node if a valid node was not resolved. CallGraphNode *resolveCallable(CallOpInterface call) const; + /// Erase the given node from the callgraph. + void eraseNode(CallGraphNode *node); + /// An iterator over the nodes of the graph. using iterator = NodeIterator; iterator begin() const { return nodes.begin(); } diff --git a/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h b/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h new file mode 100644 index 00000000000000..a384d7c22166a7 --- /dev/null +++ b/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h @@ -0,0 +1,31 @@ +//===- StandardToStandard.h - Std intra-dialect conversion -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains patterns for lowering within the Standard dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ +#define MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ + +namespace mlir { + +// Forward declarations.
+class MLIRContext; +class OwningRewritePatternList; +class TypeConverter; + +/// Add a pattern to the given pattern list to convert the operand and result +/// types of a CallOp with the given type converter. +void populateCallOpTypeConversionPattern(OwningRewritePatternList &patterns, + MLIRContext *ctx, + TypeConverter &converter); + +} // end namespace mlir + +#endif // MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index c495dfd7c90388..53dea5bec65a59 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -506,6 +506,18 @@ def LLVM_ReturnOp : LLVM_TerminatorOp<"return", [NoSideEffect]>, let parser = [{ return parseReturnOp(parser, result); }]; let printer = [{ printReturnOp(p, *this); }]; } +def LLVM_ResumeOp : LLVM_TerminatorOp<"resume", []> { + let arguments = (ins LLVM_Type:$value); + string llvmBuilder = [{ builder.CreateResume($value); }]; + let verifier = [{ + if (!isa_and_nonnull(value().getDefiningOp())) + return emitOpError("expects landingpad value as operand"); + // No check for personality of function - landingpad op verifies it. + return success(); + }]; + + let assemblyFormat = "$value attr-dict `:` type($value)"; +} def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable", []> { string llvmBuilder = [{ builder.CreateUnreachable(); }]; let parser = [{ return success(); }]; @@ -650,7 +662,8 @@ def LLVM_GlobalOp def LLVM_LLVMFuncOp : LLVM_ZeroResultOp<"func", [IsolatedFromAbove, FunctionLike, Symbol]>, Arguments<(ins DefaultValuedAttr:$linkage)> { + "Linkage::External">:$linkage, + OptionalAttr:$personality)> { let summary = "LLVM dialect function, has wrapped LLVM IR function type"; let regions = (region AnyRegion:$body); @@ -862,6 +875,37 @@ def LLVM_MatrixTranposeOp let assemblyFormat = "$matrix attr-dict `:` type($matrix) `into` type($res)"; } +// +// LLVM masked operations. +// + +/// Create a call to Masked Load intrinsic. +def LLVM_MaskedLoadOp + : LLVM_OneResultOp<"intr.masked.load">, + Arguments<(ins LLVM_Type:$data, LLVM_Type:$mask, + Variadic:$pass_thru, I32Attr:$alignment)> { + string llvmBuilder = [{ + $res = $pass_thru.empty() ? builder.CreateMaskedLoad( + $data, llvm::Align($alignment.getZExtValue()), $mask) : + builder.CreateMaskedLoad( + $data, llvm::Align($alignment.getZExtValue()), $mask, $pass_thru[0]); + }]; + let assemblyFormat = + "operands attr-dict `:` functional-type(operands, results)"; +} + +/// Create a call to Masked Store intrinsic. +def LLVM_MaskedStoreOp + : LLVM_ZeroResultOp<"intr.masked.store">, + Arguments<(ins LLVM_Type:$value, LLVM_Type:$data, LLVM_Type:$mask, + I32Attr:$alignment)> { + string llvmBuilder = [{ + builder.CreateMaskedStore( + $value, $data, llvm::Align($alignment.getZExtValue()), $mask); + }]; + let assemblyFormat = "$value `,` $data `,` $mask attr-dict `:` " + "type($value) `,` type($mask) `into` type($data)"; +} // // Atomic operations. 
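For reference, once instantiated, the `llvmBuilder` strings above become ordinary `llvm::IRBuilder` calls. A self-contained sketch, with hypothetical values standing in for the ops' `$data`, `$mask`, `$pass_thru`, and `$value` operands:

```cpp
#include "llvm/IR/IRBuilder.h"

// Masked load: lanes with a false mask bit read from PassThru instead of
// memory. Masked store: lanes with a false mask bit leave memory untouched.
llvm::Value *emitMaskedPair(llvm::IRBuilder<> &B, llvm::Value *Data,
                            llvm::Value *Mask, llvm::Value *PassThru,
                            llvm::Value *ToStore, uint64_t Alignment) {
  llvm::Value *Loaded =
      B.CreateMaskedLoad(Data, llvm::Align(Alignment), Mask, PassThru);
  B.CreateMaskedStore(ToStore, Data, llvm::Align(Alignment), Mask);
  return Loaded;
}
```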
// diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index 26d8f1401c32eb..b6715dc9fcd7ad 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -52,16 +52,6 @@ def SPIRV_Dialect : Dialect { let hasRegionResultAttrVerify = 1; let extraClassDeclaration = [{ - //===------------------------------------------------------------------===// - // Type - //===------------------------------------------------------------------===// - - /// Checks if the given `type` is valid in SPIR-V dialect. - static bool isValidType(Type type); - - /// Checks if the given `scalar type` is valid in SPIR-V dialect. - static bool isValidScalarType(Type type); - //===------------------------------------------------------------------===// // Attribute //===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h index 85b42eeea29151..ba0b7ea0714cf6 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_SPIRV_SPIRVLOWERING_H #define MLIR_DIALECT_SPIRV_SPIRVLOWERING_H +#include "mlir/Dialect/SPIRV/SPIRVAttributes.h" #include "mlir/Dialect/SPIRV/SPIRVTypes.h" #include "mlir/Dialect/SPIRV/TargetAndABI.h" #include "mlir/Transforms/DialectConversion.h" @@ -22,15 +23,38 @@ namespace mlir { /// Type conversion from standard types to SPIR-V types for shader interface. /// -/// For composite types, this converter additionally performs type wrapping to +/// Non-32-bit scalar types require special hardware support that may not exist +/// on all GPUs. This is reflected in SPIR-V, where non-32-bit scalar types +/// require special capabilities or extensions. Right now if a scalar type of a +/// certain bitwidth is not supported in the target environment, we use 32-bit +/// ones unconditionally. This requires the runtime to also feed in data with +/// a matched bitwidth and layout for interface types. The runtime can do that +/// by inspecting the SPIR-V module. +/// +/// For memref types, this converter additionally performs type wrapping to /// satisfy shader interface requirements: shader interface types must be /// pointers to structs. +/// +/// TODO(antiagainst): We might want to introduce a way to control how +/// unsupported bitwidths are handled and explicitly fail if wanted. class SPIRVTypeConverter : public TypeConverter { public: - SPIRVTypeConverter(); + explicit SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr); /// Gets the SPIR-V correspondence for the standard index type. static Type getIndexType(MLIRContext *context); + + /// Returns the corresponding memory space for memref given a SPIR-V storage + /// class. + static unsigned getMemorySpaceForStorageClass(spirv::StorageClass); + + /// Returns the SPIR-V storage class given a memory space for memref. Return + /// llvm::None if the memory space does not map to any SPIR-V storage class. + static Optional + getStorageClassForMemorySpace(unsigned space); + +private: + spirv::TargetEnv targetEnv; }; /// Base class to define a conversion pattern to lower `SourceOp` into SPIR-V. @@ -61,11 +85,10 @@ class FuncOp; class SPIRVConversionTarget : public ConversionTarget { public: /// Creates a SPIR-V conversion target for the given target environment.
- static std::unique_ptr get(TargetEnvAttr targetEnv, - MLIRContext *context); + static std::unique_ptr get(TargetEnvAttr targetAttr); private: - SPIRVConversionTarget(TargetEnvAttr targetEnv, MLIRContext *context); + explicit SPIRVConversionTarget(TargetEnvAttr targetAttr); // Be explicit that instance of this class cannot be copied or moved: there // are lambdas capturing fields of the instance. @@ -78,9 +101,7 @@ class SPIRVConversionTarget : public ConversionTarget { /// environment. bool isLegalOp(Operation *op); - Version givenVersion; /// SPIR-V version to target - llvm::SmallSet givenExtensions; /// Allowed extensions - llvm::SmallSet givenCapabilities; /// Allowed capabilities + TargetEnv targetEnv; }; /// Returns the value for the given `builtin` variable. This function gets or diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h index 385e79a0445eb4..85b35f73f82c5e 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h @@ -78,6 +78,8 @@ class SPIRVType : public Type { static bool classof(Type type); + bool isScalarOrVector(); + /// The extension requirements for each type are following the /// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) /// convention. @@ -109,6 +111,11 @@ class ScalarType : public SPIRVType { static bool classof(Type type); + /// Returns true if the given float type is valid for the SPIR-V dialect. + static bool isValid(FloatType); + /// Returns true if the given integer type is valid for the SPIR-V dialect. + static bool isValid(IntegerType); + void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, Optional storage = llvm::None); void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, @@ -122,6 +129,9 @@ class CompositeType : public SPIRVType { static bool classof(Type type); + /// Returns true if the given vector type is valid for the SPIR-V dialect. + static bool isValid(VectorType); + unsigned getNumElements() const; Type getElementType(unsigned) const; diff --git a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h index 5ffd00c530c6e0..3f14addd9b6bbf 100644 --- a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h +++ b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h @@ -15,6 +15,7 @@ #include "mlir/Dialect/SPIRV/SPIRVAttributes.h" #include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallSet.h" namespace mlir { class Operation; @@ -22,13 +23,45 @@ namespace spirv { enum class StorageClass : uint32_t; +/// A wrapper class around a spirv::TargetEnvAttr to provide query methods for +/// allowed version/capabilities/extensions. +class TargetEnv { +public: + explicit TargetEnv(TargetEnvAttr targetAttr); + + Version getVersion(); + + /// Returns true if the given capability is allowed. + bool allows(Capability) const; + /// Returns the first allowed one if any of the given capabilities is allowed. + /// Returns llvm::None otherwise. + Optional allows(ArrayRef) const; + + /// Returns true if the given extension is allowed. + bool allows(Extension) const; + /// Returns the first allowed one if any of the given extensions is allowed. + /// Returns llvm::None otherwise. + Optional allows(ArrayRef) const; + + /// Returns the MLIRContext. + MLIRContext *getContext() const; + + /// Allows implicitly converting to the underlying spirv::TargetEnvAttr.
+ operator TargetEnvAttr() const { return targetAttr; } + +private: + TargetEnvAttr targetAttr; + llvm::SmallSet givenExtensions; /// Allowed extensions + llvm::SmallSet givenCapabilities; /// Allowed capabilities +}; + /// Returns the attribute name for specifying argument ABI information. StringRef getInterfaceVarABIAttrName(); /// Gets the InterfaceVarABIAttr given its fields. InterfaceVarABIAttr getInterfaceVarABIAttr(unsigned descriptorSet, unsigned binding, - StorageClass storageClass, + Optional storageClass, MLIRContext *context); /// Returns the attribute name for specifying entry point information. diff --git a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td index a463f0e8da95f7..5d08aa1f2d7c0e 100644 --- a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td +++ b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td @@ -32,7 +32,7 @@ include "mlir/Dialect/SPIRV/SPIRVBase.td" def SPV_InterfaceVarABIAttr : StructAttr<"InterfaceVarABIAttr", SPIRV_Dialect, [ StructFieldAttr<"descriptor_set", I32Attr>, StructFieldAttr<"binding", I32Attr>, - StructFieldAttr<"storage_class", SPV_StorageClassAttr> + StructFieldAttr<"storage_class", OptionalAttr> ]>; // For entry functions, this attribute specifies information related to entry diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h index 3817b35ea7cf1e..5b42132d463ab5 100644 --- a/mlir/include/mlir/IR/Builders.h +++ b/mlir/include/mlir/IR/Builders.h @@ -116,9 +116,16 @@ class Builder { IntegerAttr getSI32IntegerAttr(int32_t value); IntegerAttr getUI32IntegerAttr(uint32_t value); + /// Vector-typed DenseIntElementsAttr getters. `values` must not be empty. DenseIntElementsAttr getI32VectorAttr(ArrayRef values); DenseIntElementsAttr getI64VectorAttr(ArrayRef values); + /// Tensor-typed DenseIntElementsAttr getters. `values` can be empty. + /// These are generally preferable for representing general lists of integers + /// as attributes. + DenseIntElementsAttr getI32TensorAttr(ArrayRef values); + DenseIntElementsAttr getI64TensorAttr(ArrayRef values); + ArrayAttr getAffineMapArrayAttr(ArrayRef values); ArrayAttr getI32ArrayAttr(ArrayRef values); ArrayAttr getI64ArrayAttr(ArrayRef values); diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h index 2f17ad002b30d7..f6506023817196 100644 --- a/mlir/include/mlir/IR/Dialect.h +++ b/mlir/include/mlir/IR/Dialect.h @@ -28,6 +28,7 @@ using DialectConstantFoldHook = std::function, SmallVectorImpl &)>; using DialectExtractElementHook = std::function)>; +using DialectAllocatorFunction = std::function; /// Dialects are groups of MLIR operations and behavior associated with the /// entire group. For example, hooks into other systems for constant folding, @@ -241,24 +242,30 @@ class Dialect { /// A collection of registered dialect interfaces. DenseMap> registeredInterfaces; -}; - -using DialectAllocatorFunction = std::function; - -/// Registers a specific dialect creation function with the system, typically -/// used through the DialectRegistration template. -void registerDialectAllocator(const DialectAllocatorFunction &function); -/// Registers all dialects with the specified MLIRContext. + /// Registers a specific dialect creation function with the global registry. + /// Used through the registerDialect template. + /// Registrations are deduplicated by dialect ClassID and only the first + /// registration will be used. 
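A hedged sketch of querying the `spirv::TargetEnv` wrapper declared above; the bitwidth-selection policy shown is an assumption for illustration only (the type converter in this patch simply falls back to 32 bits):

```cpp
#include "mlir/Dialect/SPIRV/TargetAndABI.h"

// Pick an integer bitwidth based on what the target environment allows.
// The 64/16-bit handling here is illustrative, not this patch's policy.
static unsigned chooseIntBitwidth(const mlir::spirv::TargetEnv &env,
                                  unsigned wanted) {
  if (wanted == 64 && env.allows(mlir::spirv::Capability::Int64))
    return 64;
  if (wanted == 16 && env.allows(mlir::spirv::Capability::Int16))
    return 16;
  return 32; // 32-bit scalars need no extra capability.
}
```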
+ static void + registerDialectAllocator(const ClassID *classId, + const DialectAllocatorFunction &function); + template + friend void registerDialect(); +}; +/// Registers all dialects and hooks from the global registries with the +/// specified MLIRContext. void registerAllDialects(MLIRContext *context); /// Utility to register a dialect. Client can register their dialect with the /// global registry by calling registerDialect(); template void registerDialect() { - registerDialectAllocator([](MLIRContext *ctx) { - // Just allocate the dialect, the context takes ownership of it. - new ConcreteDialect(ctx); - }); + Dialect::registerDialectAllocator(ClassID::getID(), + [](MLIRContext *ctx) { + // Just allocate the dialect, the context + // takes ownership of it. + new ConcreteDialect(ctx); + }); } /// DialectRegistration provides a global initializer that registers a Dialect diff --git a/mlir/include/mlir/IR/DialectHooks.h b/mlir/include/mlir/IR/DialectHooks.h index 2dce1c2b203a5b..4e59b4953e6560 100644 --- a/mlir/include/mlir/IR/DialectHooks.h +++ b/mlir/include/mlir/IR/DialectHooks.h @@ -35,36 +35,53 @@ class DialectHooks { DialectConstantDecodeHook getDecodeHook() { return nullptr; } // Returns hook to extract an element of an opaque constant tensor. DialectExtractElementHook getExtractElementHook() { return nullptr; } + +private: + /// Registers a function that will set hooks in the registered dialects. + /// Registrations are deduplicated by dialect ClassID and only the first + /// registration will be used. + static void registerDialectHooksSetter(const ClassID *classId, + const DialectHooksSetter &function); + template + friend void registerDialectHooks(StringRef dialectName); }; -/// Registers a function that will set hooks in the registered dialects -/// based on information coming from DialectHooksRegistration. -void registerDialectHooksSetter(const DialectHooksSetter &function); +void registerDialectHooksSetter(const ClassID *classId, + const DialectHooksSetter &function); + +/// Utility to register dialect hooks. Client can register their dialect hooks +/// with the global registry by calling +/// registerDialectHooks("dialect_namespace"); +template +void registerDialectHooks(StringRef dialectName) { + DialectHooks::registerDialectHooksSetter( + ClassID::getID(), [dialectName](MLIRContext *ctx) { + Dialect *dialect = ctx->getRegisteredDialect(dialectName); + if (!dialect) { + llvm::errs() << "error: cannot register hooks for unknown dialect '" + << dialectName << "'\n"; + abort(); + } + // Set hooks. + ConcreteHooks hooks; + if (auto h = hooks.getConstantFoldHook()) + dialect->constantFoldHook = h; + if (auto h = hooks.getDecodeHook()) + dialect->decodeHook = h; + if (auto h = hooks.getExtractElementHook()) + dialect->extractElementHook = h; + }); +} /// DialectHooksRegistration provides a global initializer that registers /// a dialect hooks setter routine. /// Usage: /// /// // At namespace scope. -/// static DialectHooksRegistration unused; +/// static DialectHooksRegistration Unused("dialect_namespace"); template struct DialectHooksRegistration { DialectHooksRegistration(StringRef dialectName) { - registerDialectHooksSetter([dialectName](MLIRContext *ctx) { - Dialect *dialect = ctx->getRegisteredDialect(dialectName); - if (!dialect) { - llvm::errs() << "error: cannot register hooks for unknown dialect '" - << dialectName << "'\n"; - abort(); - } - // Set hooks. 
- ConcreteHooks hooks; - if (auto h = hooks.getConstantFoldHook()) - dialect->constantFoldHook = h; - if (auto h = hooks.getDecodeHook()) - dialect->decodeHook = h; - if (auto h = hooks.getExtractElementHook()) - dialect->extractElementHook = h; - }); + registerDialectHooks(dialectName); } }; diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index 9882ce933834b7..bab479bfbb6186 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -322,10 +322,15 @@ class PatternRewriter : public OpBuilder { /// why the failure occurred. This method allows for derived rewriters to /// optionally hook into the reason why a pattern failed, and display it to /// users. - virtual LogicalResult - notifyMatchFailure(Operation *op, - function_ref reasonCallback) { + template + std::enable_if_t::value, LogicalResult> + notifyMatchFailure(Operation *op, CallbackT &&reasonCallback) { +#ifndef NDEBUG + return notifyMatchFailure(op, + function_ref(reasonCallback)); +#else return failure(); +#endif } LogicalResult notifyMatchFailure(Operation *op, const Twine &msg) { return notifyMatchFailure(op, [&](Diagnostic &diag) { diag << msg; }); @@ -351,6 +356,17 @@ class PatternRewriter : public OpBuilder { /// uses. virtual void notifyOperationRemoved(Operation *op) {} + /// Notify the pattern rewriter that the pattern is failing to match the given + /// operation, and provide a callback to populate a diagnostic with the reason + /// why the failure occurred. This method allows for derived rewriters to + /// optionally hook into the reason why a pattern failed, and display it to + /// users. + virtual LogicalResult + notifyMatchFailure(Operation *op, + function_ref reasonCallback) { + return failure(); + } + private: /// 'op' and 'newOp' are known to have the same number of results, replace the /// uses of op with uses of newOp. diff --git a/mlir/include/mlir/IR/StandardTypes.h b/mlir/include/mlir/IR/StandardTypes.h index d1c31acb0a51a8..fb1ec07ab5fff7 100644 --- a/mlir/include/mlir/IR/StandardTypes.h +++ b/mlir/include/mlir/IR/StandardTypes.h @@ -342,8 +342,8 @@ class TensorType : public ShapedType { }; /// Ranked tensor types represent multi-dimensional arrays that have a shape -/// with a fixed number of dimensions. Each shape element can be a positive -/// integer or unknown (represented -1). +/// with a fixed number of dimensions. Each shape element can be a non-negative +/// integer or unknown (represented by -1). class RankedTensorType : public Type::TypeBase { diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h index eccc90cdae0c61..e45fa9037470af 100644 --- a/mlir/include/mlir/IR/Types.h +++ b/mlir/include/mlir/IR/Types.h @@ -169,8 +169,11 @@ class Type { /// Return true of this is a signless integer or a float type. bool isSignlessIntOrFloat(); - /// Return true of this is an integer(of any signedness) or a float type. + /// Return true if this is an integer (of any signedness) or a float type. bool isIntOrFloat(); + /// Return true if this is an integer (of any signedness), index, or float + /// type. + bool isIntOrIndexOrFloat(); /// Print the current type. 
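Call sites of `notifyMatchFailure` are unchanged by the templated overload above; the payoff is that in builds with `NDEBUG` defined the callback is never invoked and can be optimized away. A sketch using a hypothetical single-result `MyOp`, which also exercises the new `isIntOrIndexOrFloat` helper:

```cpp
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// `MyOp` is a hypothetical single-result op, so this sketch does not compile
// standalone; it only shows the call shape.
struct MyOpLowering : public OpRewritePattern<MyOp> {
  using OpRewritePattern<MyOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(MyOp op,
                                PatternRewriter &rewriter) const override {
    if (!op.getType().isIntOrIndexOrFloat())
      return rewriter.notifyMatchFailure(op, [](Diagnostic &diag) {
        diag << "expected an int, index, or float result type";
      });
    // ... perform the actual rewrite ...
    return success();
  }
};
```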
void print(raw_ostream &os); diff --git a/mlir/lib/Analysis/CallGraph.cpp b/mlir/lib/Analysis/CallGraph.cpp index 1a31f13471353c..e31641a87e054f 100644 --- a/mlir/lib/Analysis/CallGraph.cpp +++ b/mlir/lib/Analysis/CallGraph.cpp @@ -143,6 +143,23 @@ CallGraphNode *CallGraph::resolveCallable(CallOpInterface call) const { return getExternalNode(); } +/// Erase the given node from the callgraph. +void CallGraph::eraseNode(CallGraphNode *node) { + // Erase any children of this node first. + if (node->hasChildren()) { + for (const CallGraphNode::Edge &edge : llvm::make_early_inc_range(*node)) + if (edge.isChild()) + eraseNode(edge.getTarget()); + } + // Erase any edges to this node from any other nodes. + for (auto &it : nodes) { + it.second->edges.remove_if([node](const CallGraphNode::Edge &edge) { + return edge.getTarget() == node; + }); + } + nodes.erase(node->getCallableRegion()); +} + //===----------------------------------------------------------------------===// // Printing diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 4634345cf43e53..2f1826a1e29911 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -10,5 +10,6 @@ add_subdirectory(LoopsToGPU) add_subdirectory(LoopToStandard) add_subdirectory(StandardToLLVM) add_subdirectory(StandardToSPIRV) +add_subdirectory(StandardToStandard) add_subdirectory(VectorToLLVM) add_subdirectory(VectorToLoops) diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp index 533ef7f53b92c7..5483c2330c20d6 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -349,10 +349,15 @@ LogicalResult GPUFuncOpConversion::matchAndRewrite( if (!gpu::GPUDialect::isKernel(funcOp)) return failure(); + // TODO(antiagainst): we are dictating the ABI by ourselves here; it should be + // specified outside. 
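Returning to the new `CallGraph::eraseNode` above, a plausible caller is a transform that has proven a callable dead; `eraseDeadCallable` and `deadFunc` are hypothetical names, and `lookupNode` is the existing CallGraph accessor:

```cpp
#include "mlir/Analysis/CallGraph.h"
#include "mlir/IR/Function.h"

// Drop a dead function's node (plus child nodes and inbound edges) from the
// callgraph before erasing the function itself.
static void eraseDeadCallable(mlir::CallGraph &cg, mlir::FuncOp deadFunc) {
  if (mlir::CallGraphNode *node = cg.lookupNode(&deadFunc.getBody()))
    cg.eraseNode(node);
  deadFunc.erase();
}
```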
SmallVector argABI; - for (auto argNum : llvm::seq(0, funcOp.getNumArguments())) { - argABI.push_back(spirv::getInterfaceVarABIAttr( - 0, argNum, spirv::StorageClass::StorageBuffer, rewriter.getContext())); + for (auto argIndex : llvm::seq(0, funcOp.getNumArguments())) { + Optional sc; + if (funcOp.getArgument(argIndex).getType().isIntOrIndexOrFloat()) + sc = spirv::StorageClass::StorageBuffer; + argABI.push_back( + spirv::getInterfaceVarABIAttr(0, argIndex, sc, rewriter.getContext())); } auto entryPointAttr = spirv::lookupEntryPointABI(funcOp); diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp index 4b84bc424fbdff..272eb163ab69b9 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -52,14 +52,15 @@ void GPUToSPIRVPass::runOnModule() { kernelModules.push_back(builder.clone(*moduleOp.getOperation())); }); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateGPUToSPIRVPatterns(context, typeConverter, patterns); populateStandardToSPIRVPatterns(context, typeConverter, patterns); - std::unique_ptr target = spirv::SPIRVConversionTarget::get( - spirv::lookupTargetEnvOrDefault(module), context); - if (failed(applyFullConversion(kernelModules, *target, patterns, &typeConverter))) { return signalPassFailure(); diff --git a/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp b/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp index 68d31ca7247943..4477c070796efa 100644 --- a/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp +++ b/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp @@ -25,15 +25,15 @@ void LinalgToSPIRVPass::runOnModule() { MLIRContext *context = &getContext(); ModuleOp module = getModule(); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateLinalgToSPIRVPatterns(context, typeConverter, patterns); populateBuiltinFuncToSPIRVPatterns(context, typeConverter, patterns); - auto targetEnv = spirv::lookupTargetEnvOrDefault(module); - std::unique_ptr target = - spirv::SPIRVConversionTarget::get(targetEnv, context); - // Allow builtin ops. 
target->addLegalOp(); target->addDynamicallyLegalOp( diff --git a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt index 308f1b0074ed94..6d940eaf024e3c 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_TARGET_DEFINITIONS StandardToSPIRV.td) -mlir_tablegen(StandardToSPIRV.cpp.inc -gen-rewriters) -add_public_tablegen_target(MLIRStandardToSPIRVIncGen) - add_mlir_conversion_library(MLIRStandardToSPIRVTransforms ConvertStandardToSPIRV.cpp ConvertStandardToSPIRVPass.cpp @@ -10,9 +6,6 @@ add_mlir_conversion_library(MLIRStandardToSPIRVTransforms ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SPIRV ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR - - DEPENDS - MLIRStandardToSPIRVIncGen ) target_link_libraries(MLIRStandardToSPIRVTransforms diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp index 310dcd8a86bdb4..69ef69d1de65ac 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp @@ -6,52 +6,176 @@ // //===----------------------------------------------------------------------===// // -// This file implements patterns to convert Standard Ops to the SPIR-V dialect. +// This file implements patterns to convert standard ops to SPIR-V ops. // //===----------------------------------------------------------------------===// + #include "mlir/Dialect/SPIRV/LayoutUtils.h" #include "mlir/Dialect/SPIRV/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/SPIRVLowering.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineMap.h" +#include "mlir/Support/LogicalResult.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "std-to-spirv-pattern" using namespace mlir; +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +/// Returns true if the given `type` is a boolean scalar or vector type. +static bool isBoolScalarOrVector(Type type) { + if (type.isInteger(1)) + return true; + if (auto vecType = type.dyn_cast()) + return vecType.getElementType().isInteger(1); + return false; +} + +/// Converts the given `srcAttr` into a boolean attribute if it holds an integral +/// value. Returns null attribute if conversion fails. +static BoolAttr convertBoolAttr(Attribute srcAttr, Builder builder) { + if (auto boolAttr = srcAttr.dyn_cast()) + return boolAttr; + if (auto intAttr = srcAttr.dyn_cast()) + return builder.getBoolAttr(intAttr.getValue().getBoolValue()); + return BoolAttr(); +} + +/// Converts the given `srcAttr` to a new attribute of the given `dstType`. +/// Returns null attribute if conversion fails. +static IntegerAttr convertIntegerAttr(IntegerAttr srcAttr, IntegerType dstType, + Builder builder) { + // If the source number uses fewer active bits than the target bitwidth, then + // it should be safe to convert. + if (srcAttr.getValue().isIntN(dstType.getWidth())) + return builder.getIntegerAttr(dstType, srcAttr.getInt()); + + // XXX: Try again by interpreting the source number as a signed value. + // Although integers in the standard dialect are signless, they can represent + // a signed number. It's the operation that decides how to interpret it.
This is + // dangerous, but it seems there is no good way of handling this if we still + // want to change the bitwidth. Emit a message at least. + if (srcAttr.getValue().isSignedIntN(dstType.getWidth())) { + auto dstAttr = builder.getIntegerAttr(dstType, srcAttr.getInt()); + LLVM_DEBUG(llvm::dbgs() << "attribute '" << srcAttr << "' converted to '" + << dstAttr << "' for type '" << dstType << "'\n"); + return dstAttr; + } + + LLVM_DEBUG(llvm::dbgs() << "attribute '" << srcAttr + << "' illegal: cannot fit into target type '" + << dstType << "'\n"); + return IntegerAttr(); +} + +/// Converts the given `srcAttr` to a new attribute of the given `dstType`. +/// Returns null attribute if `dstType` is not 32-bit or conversion fails. +static FloatAttr convertFloatAttr(FloatAttr srcAttr, FloatType dstType, + Builder builder) { + // Only support converting to float for now. + if (!dstType.isF32()) + return FloatAttr(); + + // Try to convert the source floating-point number to single precision. + APFloat dstVal = srcAttr.getValue(); + bool losesInfo = false; + APFloat::opStatus status = + dstVal.convert(APFloat::IEEEsingle(), APFloat::rmTowardZero, &losesInfo); + if (status != APFloat::opOK || losesInfo) { + LLVM_DEBUG(llvm::dbgs() + << srcAttr << " illegal: cannot fit into converted type '" + << dstType << "'\n"); + return FloatAttr(); + } + + return builder.getF32FloatAttr(dstVal.convertToFloat()); +} + //===----------------------------------------------------------------------===// // Operation conversion //===----------------------------------------------------------------------===// +// Note that DRR cannot be used for the patterns in this file: we may need to +// convert type along the way, which requires ConversionPattern. DRR generates +// normal RewritePattern. + namespace { -/// Convert composite constant operation to SPIR-V dialect. -// TODO(denis0x0D) : move to DRR. -class ConstantCompositeOpConversion final : public SPIRVOpLowering { +/// Converts binary standard operations to SPIR-V operations. +template +class BinaryOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() == 2); + auto dstType = this->typeConverter.convertType(operation.getType()); + if (!dstType) + return failure(); + rewriter.template replaceOpWithNewOp(operation, dstType, operands, + ArrayRef()); + return success(); + } +}; + +/// Converts bitwise standard operations to SPIR-V operations. This is a special +/// pattern other than the BinaryOpPatternPattern because if the operands are +/// boolean values, SPIR-V uses different operations (`SPIRVLogicalOp`). For +/// non-boolean operands, SPIR-V should use `SPIRVBitwiseOp`. 
+template +class BitwiseOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() == 2); + auto dstType = + this->typeConverter.convertType(operation.getResult().getType()); + if (!dstType) + return failure(); + if (isBoolScalarOrVector(operands.front().getType())) { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } else { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } + return success(); + } +}; + +/// Converts composite std.constant operation to spv.constant. +class ConstantCompositeOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult - matchAndRewrite(ConstantOp constCompositeOp, ArrayRef operands, + matchAndRewrite(ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; -/// Convert constant operation with IndexType return to SPIR-V constant -/// operation. Since IndexType is not used within SPIR-V dialect, this needs -/// special handling to make sure the result type and the type of the value -/// attribute are consistent. -// TODO(ravishankarm) : This should be moved into DRR. -class ConstantIndexOpConversion final : public SPIRVOpLowering { +/// Converts scalar std.constant operation to spv.constant. +class ConstantScalarOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult - matchAndRewrite(ConstantOp constIndexOp, ArrayRef operands, + matchAndRewrite(ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; -/// Convert floating-point comparison operations to SPIR-V dialect. -class CmpFOpConversion final : public SPIRVOpLowering { +/// Converts floating-point comparison operations to SPIR-V ops. +class CmpFOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -60,8 +184,8 @@ class CmpFOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert compare operation to SPIR-V dialect. -class CmpIOpConversion final : public SPIRVOpLowering { +/// Converts integer compare operation to SPIR-V ops. +class CmpIOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -70,33 +194,8 @@ class CmpIOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert integer binary operations to SPIR-V operations. Cannot use -/// tablegen for this. If the integer operation is on variables of IndexType, -/// the type of the return value of the replacement operation differs from -/// that of the replaced operation. This is not handled in tablegen-based -/// pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -template -class IntegerOpConversion final : public SPIRVOpLowering { -public: - using SPIRVOpLowering::SPIRVOpLowering; - - LogicalResult - matchAndRewrite(StdOp operation, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto resultType = - this->typeConverter.convertType(operation.getResult().getType()); - rewriter.template replaceOpWithNewOp( - operation, resultType, operands, ArrayRef()); - return success(); - } -}; - -/// Convert load -> spv.LoadOp. 
The operands of the replaced operation are of -/// IndexType while that of the replacement operation are of type i32. This is -/// not supported in tablegen based pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -class LoadOpConversion final : public SPIRVOpLowering { +/// Converts std.load to spv.Load. +class LoadOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -105,9 +204,8 @@ class LoadOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert return -> spv.Return. -// TODO(ravishankarm) : This should be moved into DRR. -class ReturnOpConversion final : public SPIRVOpLowering { +/// Converts std.return to spv.Return. +class ReturnOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -116,9 +214,8 @@ class ReturnOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert select -> spv.Select -// TODO(ravishankarm) : This should be moved into DRR. -class SelectOpConversion final : public SPIRVOpLowering { +/// Converts std.select to spv.Select. +class SelectOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult @@ -126,11 +223,8 @@ class SelectOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert store -> spv.StoreOp. The operands of the replaced operation are -/// of IndexType while that of the replacement operation are of type i32. This -/// is not supported in tablegen based pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -class StoreOpConversion final : public SPIRVOpLowering { +/// Converts std.store to spv.Store. +class StoreOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -139,72 +233,179 @@ class StoreOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; +/// Converts type-casting standard operations to SPIR-V operations. +template +class TypeCastingOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() == 1); + auto dstType = + this->typeConverter.convertType(operation.getResult().getType()); + if (!dstType) + return failure(); + if (dstType == operands.front().getType()) { + // Type conversion can make the source and target types identical, in + // which case we can just erase this operation by forwarding its operand. + rewriter.replaceOp(operation, operands.front()); + } else { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } + return success(); + } +}; + +/// Converts std.xor to SPIR-V operations. +class XOrOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(XOrOp xorOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace //===----------------------------------------------------------------------===// // ConstantOp with composite type.
//===----------------------------------------------------------------------===// -LogicalResult ConstantCompositeOpConversion::matchAndRewrite( - ConstantOp constCompositeOp, ArrayRef operands, +LogicalResult ConstantCompositeOpPattern::matchAndRewrite( + ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - auto compositeType = - constCompositeOp.getResult().getType().dyn_cast(); - if (!compositeType) + auto srcType = constOp.getType().dyn_cast(); + if (!srcType) return failure(); - auto spirvCompositeType = typeConverter.convertType(compositeType); - if (!spirvCompositeType) + // std.constant should only have vector or tensor types. + assert(srcType.isa() || srcType.isa()); + + auto dstType = typeConverter.convertType(srcType); + if (!dstType) return failure(); - auto linearizedElements = - constCompositeOp.value().dyn_cast(); - if (!linearizedElements) + auto dstElementsAttr = constOp.value().dyn_cast(); + if (!dstElementsAttr) return failure(); + ShapedType dstAttrType = dstElementsAttr.getType(); - // If composite type has rank greater than one, then perform linearization. - if (compositeType.getRank() > 1) { - auto linearizedType = RankedTensorType::get(compositeType.getNumElements(), - compositeType.getElementType()); - linearizedElements = linearizedElements.reshape(linearizedType); + // If the composite type has more than one dimension, perform linearization. + if (srcType.getRank() > 1) { + if (srcType.isa()) { + dstAttrType = RankedTensorType::get(srcType.getNumElements(), + srcType.getElementType()); + dstElementsAttr = dstElementsAttr.reshape(dstAttrType); + } else { + // TODO(antiagainst): add support for large vectors. + return failure(); + } + } + + Type srcElemType = srcType.getElementType(); + Type dstElemType; + // Tensor types are converted to SPIR-V array types; vector types are + // converted to SPIR-V vector/array types. + if (auto arrayType = dstType.dyn_cast()) + dstElemType = arrayType.getElementType(); + else + dstElemType = dstType.cast().getElementType(); + + // If the source and destination element types are different, perform + // attribute conversion. + if (srcElemType != dstElemType) { + SmallVector elements; + if (srcElemType.isa()) { + for (Attribute srcAttr : dstElementsAttr.getAttributeValues()) { + FloatAttr dstAttr = convertFloatAttr( + srcAttr.cast(), dstElemType.cast(), rewriter); + if (!dstAttr) + return failure(); + elements.push_back(dstAttr); + } + } else if (srcElemType.isInteger(1)) { + // i1 element attributes are not converted here yet. + return failure(); + } else { + for (Attribute srcAttr : dstElementsAttr.getAttributeValues()) { + IntegerAttr dstAttr = + convertIntegerAttr(srcAttr.cast(), + dstElemType.cast(), rewriter); + if (!dstAttr) + return failure(); + elements.push_back(dstAttr); + } + } + + // Unfortunately, we cannot use dialect-specific types for element + // attributes; element attributes only work with standard types. So we need + // to prepare another converted standard type for the destination elements + // attribute.
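+    // For example (editor's illustration, not part of the original patch):
+    // on a target without the Int64 capability, the per-element conversion
+    // above plus the attribute type rebuilt below turn
+    //   dense<[1, 2]> : tensor<2xi64>
+    // into
+    //   dense<[1, 2]> : tensor<2xi32>
+    // before it is wrapped into the resulting spv.constant.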
+ if (dstAttrType.isa()) + dstAttrType = RankedTensorType::get(dstAttrType.getShape(), dstElemType); + else + dstAttrType = VectorType::get(dstAttrType.getShape(), dstElemType); + + dstElementsAttr = DenseElementsAttr::get(dstAttrType, elements); } - rewriter.replaceOpWithNewOp( - constCompositeOp, spirvCompositeType, linearizedElements); + rewriter.replaceOpWithNewOp(constOp, dstType, + dstElementsAttr); return success(); } //===----------------------------------------------------------------------===// -// ConstantOp with index type. +// ConstantOp with scalar type. //===----------------------------------------------------------------------===// -LogicalResult ConstantIndexOpConversion::matchAndRewrite( - ConstantOp constIndexOp, ArrayRef operands, +LogicalResult ConstantScalarOpPattern::matchAndRewrite( + ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - if (!constIndexOp.getResult().getType().isa()) { + Type srcType = constOp.getType(); + if (!srcType.isIntOrIndexOrFloat()) return failure(); - } - // The attribute has index type which is not directly supported in - // SPIR-V. Get the integer value and create a new IntegerAttr. - auto constAttr = constIndexOp.value().dyn_cast(); - if (!constAttr) { + + Type dstType = typeConverter.convertType(srcType); + if (!dstType) return failure(); + + // Floating-point types. + if (srcType.isa()) { + auto srcAttr = constOp.value().cast(); + auto dstAttr = srcAttr; + + // Floating-point types not supported in the target environment are all + // converted to float type. + if (srcType != dstType) { + dstAttr = convertFloatAttr(srcAttr, dstType.cast(), rewriter); + if (!dstAttr) + return failure(); + } + + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); + return success(); } - // Use the bitwidth set in the value attribute to decide the result type - // of the SPIR-V constant operation since SPIR-V does not support index - // types. - auto constVal = constAttr.getValue(); - auto constValType = constAttr.getType().dyn_cast(); - if (!constValType) { - return failure(); + // Bool type. + if (srcType.isInteger(1)) { + // std.constant can use 0/1 instead of true/false for i1 values. We need to + // handle that here. + auto dstAttr = convertBoolAttr(constOp.value(), rewriter); + if (!dstAttr) + return failure(); + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); + return success(); } - auto spirvConstType = - typeConverter.convertType(constIndexOp.getResult().getType()); - auto spirvConstVal = - rewriter.getIntegerAttr(spirvConstType, constAttr.getInt()); - rewriter.replaceOpWithNewOp(constIndexOp, spirvConstType, - spirvConstVal); + + // IndexType or IntegerType. Index values are converted to 32-bit integer + // values when converting to SPIR-V. 
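+  // For example (illustration only): `%c = constant 42 : index` becomes
+  // `%c = spv.constant 42 : i32`, since this conversion maps index values
+  // to 32-bit integers.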
+ auto srcAttr = constOp.value().cast(); + auto dstAttr = + convertIntegerAttr(srcAttr, dstType.cast(), rewriter); + if (!dstAttr) + return failure(); + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); return success(); } @@ -213,8 +414,8 @@ LogicalResult ConstantIndexOpConversion::matchAndRewrite( //===----------------------------------------------------------------------===// LogicalResult -CmpFOpConversion::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +CmpFOpPattern::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { CmpFOpOperandAdaptor cmpFOpOperands(operands); switch (cmpFOp.getPredicate()) { @@ -253,8 +454,8 @@ CmpFOpConversion::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -CmpIOpConversion::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +CmpIOpPattern::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { CmpIOpOperandAdaptor cmpIOpOperands(operands); switch (cmpIOp.getPredicate()) { @@ -286,8 +487,8 @@ CmpIOpConversion::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -LoadOpConversion::matchAndRewrite(LoadOp loadOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +LoadOpPattern::matchAndRewrite(LoadOp loadOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { LoadOpOperandAdaptor loadOperands(operands); auto loadPtr = spirv::getElementPtr( typeConverter, loadOp.memref().getType().cast(), @@ -301,8 +502,8 @@ LoadOpConversion::matchAndRewrite(LoadOp loadOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -ReturnOpConversion::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +ReturnOpPattern::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { if (returnOp.getNumOperands()) { return failure(); } @@ -315,8 +516,8 @@ ReturnOpConversion::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -SelectOpConversion::matchAndRewrite(SelectOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +SelectOpPattern::matchAndRewrite(SelectOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { SelectOpOperandAdaptor selectOperands(operands); rewriter.replaceOpWithNewOp(op, selectOperands.condition(), selectOperands.true_value(), @@ -329,8 +530,8 @@ SelectOpConversion::matchAndRewrite(SelectOp op, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -StoreOpConversion::matchAndRewrite(StoreOp storeOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +StoreOpPattern::matchAndRewrite(StoreOp storeOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { StoreOpOperandAdaptor storeOperands(operands); auto storePtr = spirv::getElementPtr( typeConverter, storeOp.memref().getType().cast(), @@ -341,25 +542,58 @@ StoreOpConversion::matchAndRewrite(StoreOp storeOp, ArrayRef operands, return success(); } -namespace { -/// Import the Standard Ops to SPIR-V Patterns. 
-#include "StandardToSPIRV.cpp.inc" -} // namespace +//===----------------------------------------------------------------------===// +// XorOp +//===----------------------------------------------------------------------===// + +LogicalResult +XOrOpPattern::matchAndRewrite(XOrOp xorOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + assert(operands.size() == 2); + + if (isBoolScalarOrVector(operands.front().getType())) + return failure(); + + auto dstType = typeConverter.convertType(xorOp.getType()); + if (!dstType) + return failure(); + rewriter.replaceOpWithNewOp(xorOp, dstType, operands, + ArrayRef()); + + return success(); +} + +//===----------------------------------------------------------------------===// +// Pattern population +//===----------------------------------------------------------------------===// namespace mlir { void populateStandardToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, OwningRewritePatternList &patterns) { - // Add patterns that lower operations into SPIR-V dialect. - populateWithGenerated(context, &patterns); - patterns.insert, - IntegerOpConversion, - IntegerOpConversion, - IntegerOpConversion, - IntegerOpConversion, LoadOpConversion, - ReturnOpConversion, SelectOpConversion, StoreOpConversion>( + patterns.insert< + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BinaryOpPattern, + BitwiseOpPattern, + BitwiseOpPattern, + ConstantCompositeOpPattern, ConstantScalarOpPattern, CmpFOpPattern, + CmpIOpPattern, LoadOpPattern, ReturnOpPattern, SelectOpPattern, + StoreOpPattern, TypeCastingOpPattern, + TypeCastingOpPattern, + TypeCastingOpPattern, XOrOpPattern>( context, typeConverter); } } // namespace mlir diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp index 7a3dae287d702b..efccd168d6ea80 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp @@ -31,14 +31,15 @@ void ConvertStandardToSPIRVPass::runOnModule() { MLIRContext *context = &getContext(); ModuleOp module = getModule(); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateStandardToSPIRVPatterns(context, typeConverter, patterns); populateBuiltinFuncToSPIRVPatterns(context, typeConverter, patterns); - std::unique_ptr target = spirv::SPIRVConversionTarget::get( - spirv::lookupTargetEnvOrDefault(module), context); - if (failed(applyPartialConversion(module, *target, patterns))) { return signalPassFailure(); } diff --git a/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td b/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td deleted file mode 100644 index a23ae5fe81c9d8..00000000000000 --- a/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td +++ /dev/null @@ -1,53 +0,0 @@ -//==- StandardToSPIRV.td - Standard Ops to SPIR-V Patterns ---*- tablegen -*==// - -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Defines Patterns to lower standard ops to SPIR-V. -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_CONVERSION_STANDARDTOSPIRV_TD -#define MLIR_CONVERSION_STANDARDTOSPIRV_TD - -include "mlir/Dialect/StandardOps/IR/Ops.td" -include "mlir/Dialect/SPIRV/SPIRVOps.td" - -class BinaryOpPattern : - Pat<(src SPV_ScalarOrVectorOf:$l, SPV_ScalarOrVectorOf:$r), - (tgt $l, $r)>; - -class UnaryOpPattern : - Pat<(src type:$input), - (tgt $input)>; - -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; - -def : UnaryOpPattern; -def : UnaryOpPattern; -def : UnaryOpPattern; - -// Constant Op -// TODO(ravishankarm): Handle lowering other constant types. -def : Pat<(ConstantOp:$result $valueAttr), - (SPV_ConstantOp $valueAttr), - [(SPV_ScalarOrVector $result)]>; - -#endif // MLIR_CONVERSION_STANDARDTOSPIRV_TD diff --git a/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt b/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt new file mode 100644 index 00000000000000..e1bc42a746ee9f --- /dev/null +++ b/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_conversion_library(MLIRStandardToStandard + StandardToStandard.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/StandardToStandard + ) +target_link_libraries(MLIRStandardToStandard + PUBLIC + MLIRIR + MLIRPass + MLIRStandardOps + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp b/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp new file mode 100644 index 00000000000000..e4734f31fd6336 --- /dev/null +++ b/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp @@ -0,0 +1,49 @@ +//===- StandardToStandard.cpp - Std intra-dialect lowering ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/StandardToStandard/StandardToStandard.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +namespace { +// Converts the operand and result types of the Standard's CallOp, used together +// with the FuncOpSignatureConversion. +struct CallOpSignatureConversion : public OpConversionPattern { + CallOpSignatureConversion(MLIRContext *ctx, TypeConverter &converter) + : OpConversionPattern(ctx), converter(converter) {} + + /// Hook for derived classes to implement combined matching and rewriting. + LogicalResult + matchAndRewrite(CallOp callOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + FunctionType type = callOp.getCalleeType(); + + // Convert the original function results. + SmallVector convertedResults; + if (failed(converter.convertTypes(type.getResults(), convertedResults))) + return failure(); + + // Substitute with the new result types from the corresponding FuncType + // conversion. 
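+    // For example (editor's sketch, not part of the original patch): if the
+    // enclosing signature conversion maps i64 to i32, then
+    //   %r = call @f(%arg) : (i64) -> i64
+    // is rewritten as
+    //   %r = call @f(%arg') : (i32) -> i32
+    // where %arg' is the already-converted operand handed in by the framework.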
+ rewriter.replaceOpWithNewOp(callOp, callOp.callee(), + convertedResults, operands); + return success(); + } + + /// The type converter to use when rewriting the signature. + TypeConverter &converter; +}; +} // end anonymous namespace + +void mlir::populateCallOpTypeConversionPattern( + OwningRewritePatternList &patterns, MLIRContext *ctx, + TypeConverter &converter) { + patterns.insert(ctx, converter); +} diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 2a2e6699fee5e6..c69530b28e29c4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -407,6 +407,11 @@ static ParseResult parseInvokeOp(OpAsmParser &parser, OperationState &result) { static LogicalResult verify(LandingpadOp op) { Value value; + if (LLVMFuncOp func = op.getParentOfType()) { + if (!func.personality().hasValue()) + return op.emitError( + "llvm.landingpad needs to be in a function with a personality"); + } if (!op.cleanup() && op.getOperands().empty()) return op.emitError("landingpad instruction expects at least one clause or " diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp index 4dc41e2c87ae04..817f0235ccff34 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp @@ -185,9 +185,9 @@ class LinalgScopedEmitter { if (!convOp.padding()) return im(imIdx); + auto *context = ScopedContext::getContext(); ValueHandle zeroIndex = std_constant_index(0); - SmallVector conds = { - std_constant_int(/*value=*/1, /*width=*/1)}; + SmallVector conds; SmallVector clampedImIdx; for (auto iter : llvm::enumerate(imIdx)) { int idx = iter.index(); @@ -201,13 +201,16 @@ class LinalgScopedEmitter { using edsc::op::operator<; using edsc::op::operator>=; using edsc::op::operator||; - conds.push_back(conds.back() || (dim < zeroIndex)); - ValueHandle bound = std_dim(convOp.input(), idx); - conds.push_back(conds.back() || (dim >= bound)); + ValueHandle leftOutOfBound = dim < zeroIndex; + if (conds.empty()) + conds.push_back(leftOutOfBound); + else + conds.push_back(conds.back() || leftOutOfBound); + ValueHandle rightBound = std_dim(convOp.input(), idx); + conds.push_back(conds.back() || (dim >= rightBound)); // When padding is involed, the indices will only be shifted to negative, // so having a max op is enough. - auto *context = ScopedContext::getContext(); auto maxMap = AffineMap::get(/*dimCount=*/1, 0, {getAffineDimExpr(/*position=*/0, context), getAffineConstantExpr(0, context)}); @@ -219,7 +222,8 @@ class LinalgScopedEmitter { Type type = convOp.input().getType().cast().getElementType(); ValueHandle zero = std_constant(type, b.getZeroAttr(type)); ValueHandle readInput = im(clampedImIdx); - return std_select(conds.back(), zero, readInput); + return conds.empty() ? 
readInput + : std_select(conds.back(), zero, readInput); } static void emitScalarImplementation(ArrayRef allIvs, ConvOp convOp) { diff --git a/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp b/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp index 44930b91e0ffd4..d4ce17c93706d8 100644 --- a/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp +++ b/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp @@ -59,7 +59,7 @@ VulkanLayoutUtils::decorateType(spirv::StructType structType, Type VulkanLayoutUtils::decorateType(Type type, VulkanLayoutUtils::Size &size, VulkanLayoutUtils::Size &alignment) { - if (spirv::SPIRVDialect::isValidScalarType(type)) { + if (type.isa()) { alignment = VulkanLayoutUtils::getScalarTypeAlignment(type); // Vulkan spec does not specify any padding for a scalar type. size = alignment; diff --git a/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp index f378047f36eaca..953d95b449d153 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/CommonFolders.h" #include "mlir/Dialect/SPIRV/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/SPIRVTypes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Support/Functional.h" @@ -358,15 +359,6 @@ struct ConvertSelectionOpToSelect rhs.getOperation()->getAttrList().getDictionary(); } - // Checks that given type is valid for `spv.SelectOp`. - // According to SPIR-V spec: - // "Before version 1.4, Result Type must be a pointer, scalar, or vector. - // Starting with version 1.4, Result Type can additionally be a composite type - // other than a vector." - bool isValidType(Type type) const { - return spirv::SPIRVDialect::isValidScalarType(type) || - type.isa(); - } // Returns a source value for the given block. Value getSrcValue(Block *block) const { @@ -401,11 +393,20 @@ LogicalResult ConvertSelectionOpToSelect::canCanonicalizeSelection( return failure(); } + // Checks that given type is valid for `spv.SelectOp`. + // According to SPIR-V spec: + // "Before version 1.4, Result Type must be a pointer, scalar, or vector. + // Starting with version 1.4, Result Type can additionally be a composite type + // other than a vector." + bool isScalarOrVector = trueBrStoreOp.value() + .getType() + .cast() + .isScalarOrVector(); + // Check that each `spv.Store` uses the same pointer, memory access // attributes and a valid type of the value. if ((trueBrStoreOp.ptr() != falseBrStoreOp.ptr()) || - !isSameAttrList(trueBrStoreOp, falseBrStoreOp) || - !isValidType(trueBrStoreOp.value().getType())) { + !isSameAttrList(trueBrStoreOp, falseBrStoreOp) || !isScalarOrVector) { return failure(); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp index 50ecf9ef7cbdaf..8ed417cad58d86 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp @@ -61,7 +61,7 @@ struct SPIRVInlinerInterface : public DialectInlinerInterface { BlockAndValueMapping &) const final { // Return true here when inlining into spv.func, spv.selection, and // spv.loop operations. 
- auto op = dest->getParentOp(); + auto *op = dest->getParentOp(); return isa(op) || isa(op) || isa(op); } @@ -152,42 +152,6 @@ template <> Optional parseAndVerify(SPIRVDialect const &dialect, DialectAsmParser &parser); -static bool isValidSPIRVIntType(IntegerType type) { - return llvm::is_contained(ArrayRef({1, 8, 16, 32, 64}), - type.getWidth()); -} - -bool SPIRVDialect::isValidScalarType(Type type) { - if (type.isa()) { - return !type.isBF16(); - } - if (auto intType = type.dyn_cast()) { - return isValidSPIRVIntType(intType); - } - return false; -} - -static bool isValidSPIRVVectorType(VectorType type) { - return type.getRank() == 1 && - SPIRVDialect::isValidScalarType(type.getElementType()) && - type.getNumElements() >= 2 && type.getNumElements() <= 4; -} - -bool SPIRVDialect::isValidType(Type type) { - // Allow SPIR-V dialect types - if (type.getKind() >= Type::FIRST_SPIRV_TYPE && - type.getKind() <= TypeKind::LAST_SPIRV_TYPE) { - return true; - } - if (SPIRVDialect::isValidScalarType(type)) { - return true; - } - if (auto vectorType = type.dyn_cast()) { - return isValidSPIRVVectorType(vectorType); - } - return false; -} - static Type parseAndVerifyType(SPIRVDialect const &dialect, DialectAsmParser &parser) { Type type; @@ -206,7 +170,7 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, return Type(); } } else if (auto t = type.dyn_cast()) { - if (!isValidSPIRVIntType(t)) { + if (!ScalarType::isValid(t)) { parser.emitError(typeLoc, "only 1/8/16/32/64-bit integer type allowed but found ") << type; @@ -383,7 +347,8 @@ namespace { // parseAndVerify does the actual parsing and verification of individual // elements. This is a functor since parsing the last element of the list // (termination condition) needs partial specialization. -template struct parseCommaSeparatedList { +template +struct ParseCommaSeparatedList { Optional> operator()(SPIRVDialect const &dialect, DialectAsmParser &parser) const { auto parseVal = parseAndVerify(dialect, parser); @@ -393,7 +358,7 @@ template struct parseCommaSeparatedList { auto numArgs = std::tuple_size>::value; if (numArgs != 0 && failed(parser.parseComma())) return llvm::None; - auto remainingValues = parseCommaSeparatedList{}(dialect, parser); + auto remainingValues = ParseCommaSeparatedList{}(dialect, parser); if (!remainingValues) return llvm::None; return std::tuple_cat(std::tuple(parseVal.getValue()), @@ -403,7 +368,8 @@ template struct parseCommaSeparatedList { // Partial specialization of the function to parse a comma separated list of // specs to parse the last element of the list. 
-template struct parseCommaSeparatedList { +template +struct ParseCommaSeparatedList { Optional> operator()(SPIRVDialect const &dialect, DialectAsmParser &parser) const { if (auto value = parseAndVerify(dialect, parser)) @@ -434,7 +400,7 @@ static Type parseImageType(SPIRVDialect const &dialect, return Type(); auto value = - parseCommaSeparatedList{}(dialect, parser); if (!value) @@ -597,10 +563,10 @@ static void print(StructType type, DialectAsmPrinter &os) { if (!decorations.empty()) os << ", "; } - auto each_fn = [&os](spirv::Decoration decoration) { + auto eachFn = [&os](spirv::Decoration decoration) { os << stringifyDecoration(decoration); }; - interleaveComma(decorations, os, each_fn); + interleaveComma(decorations, os, eachFn); os << "]"; } }; @@ -865,39 +831,44 @@ LogicalResult SPIRVDialect::verifyOperationAttribute(Operation *op, return success(); } -// Verifies the given SPIR-V `attribute` attached to a region's argument or -// result and reports error to the given location if invalid. -static LogicalResult -verifyRegionAttribute(Location loc, NamedAttribute attribute, bool forArg) { +/// Verifies the given SPIR-V `attribute` attached to a value of the given +/// `valueType` is valid. +static LogicalResult verifyRegionAttribute(Location loc, Type valueType, + NamedAttribute attribute) { StringRef symbol = attribute.first.strref(); Attribute attr = attribute.second; if (symbol != spirv::getInterfaceVarABIAttrName()) return emitError(loc, "found unsupported '") - << symbol << "' attribute on region " - << (forArg ? "argument" : "result"); + << symbol << "' attribute on region argument"; - if (!attr.isa()) + auto varABIAttr = attr.dyn_cast(); + if (!varABIAttr) return emitError(loc, "'") << symbol - << "' attribute must be a dictionary attribute containing three " - "32-bit integer attributes: 'descriptor_set', 'binding', and " - "'storage_class'"; + << "' attribute must be a dictionary attribute containing two or " + "three 32-bit integer attributes: 'descriptor_set', 'binding', " + "and optional 'storage_class'"; + if (varABIAttr.storage_class() && !valueType.isIntOrIndexOrFloat()) + return emitError(loc, "'") << symbol + << "' attribute cannot specify storage class " + "when attaching to a non-scalar value"; return success(); } LogicalResult SPIRVDialect::verifyRegionArgAttribute(Operation *op, - unsigned /*regionIndex*/, - unsigned /*argIndex*/, + unsigned regionIndex, + unsigned argIndex, NamedAttribute attribute) { - return verifyRegionAttribute(op->getLoc(), attribute, - /*forArg=*/true); + return verifyRegionAttribute( + op->getLoc(), + op->getRegion(regionIndex).front().getArgument(argIndex).getType(), + attribute); } LogicalResult SPIRVDialect::verifyRegionResultAttribute( Operation *op, unsigned /*regionIndex*/, unsigned /*resultIndex*/, NamedAttribute attribute) { - return verifyRegionAttribute(op->getLoc(), attribute, - /*forArg=*/false); + return op->emitError("cannot attach SPIR-V attributes to region result"); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp index 4adabdaa597eae..3fd987b0e56576 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp @@ -1,4 +1,4 @@ -//===- SPIRVLowering.cpp - Standard to SPIR-V dialect conversion--===// +//===- SPIRVLowering.cpp - SPIR-V lowering utilities ----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,6 +15,7 @@ #include "mlir/Dialect/SPIRV/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include @@ -23,6 +24,64 @@ using namespace mlir; +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +/// Checks that the extension requirements in `candidates` can be satisfied by +/// the given `targetEnv`. +/// +/// `candidates` is a vector of vectors of extension requirements, following +/// the ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) +/// convention. +template +static LogicalResult checkExtensionRequirements( + LabelT label, const spirv::TargetEnv &targetEnv, + const spirv::SPIRVType::ExtensionArrayRefVector &candidates) { + for (const auto &ors : candidates) { + if (targetEnv.allows(ors)) + continue; + + SmallVector extStrings; + for (spirv::Extension ext : ors) + extStrings.push_back(spirv::stringifyExtension(ext)); + + LLVM_DEBUG(llvm::dbgs() + << label << " illegal: requires at least one extension in [" + << llvm::join(extStrings, ", ") + << "] but none allowed in target environment\n"); + return failure(); + } + return success(); +} + +/// Checks that the capability requirements in `candidates` can be satisfied by +/// the given `targetEnv`. +/// +/// `candidates` is a vector of vectors of capability requirements, following +/// the ((Capability::A OR Capability::B) AND (Capability::C OR Capability::D)) +/// convention. +template +static LogicalResult checkCapabilityRequirements( + LabelT label, const spirv::TargetEnv &targetEnv, + const spirv::SPIRVType::CapabilityArrayRefVector &candidates) { + for (const auto &ors : candidates) { + if (targetEnv.allows(ors)) + continue; + + SmallVector capStrings; + for (spirv::Capability cap : ors) + capStrings.push_back(spirv::stringifyCapability(cap)); + + LLVM_DEBUG(llvm::dbgs() + << label << " illegal: requires at least one capability in [" + << llvm::join(capStrings, ", ") + << "] but none allowed in target environment\n"); + return failure(); + } + return success(); +} + //===----------------------------------------------------------------------===// // Type Conversion //===----------------------------------------------------------------------===// @@ -38,10 +97,67 @@ Type SPIRVTypeConverter::getIndexType(MLIRContext *context) { return IntegerType::get(32, context); } +/// Mapping from SPIR-V storage classes to memref memory spaces. +/// +/// Note: memref does not have defined semantics for each memory space; it +/// depends on the context where it is used. There are no particular reasons +/// behind the number assignments; we try to follow NVVM conventions and +/// largely give common storage classes a smaller number. The hope is to use a +/// symbolic memory space representation eventually, once memref supports it. +// TODO(antiagainst): swap Generic and StorageBuffer assignment to be more akin +// to NVVM.
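+// A usage sketch (editor's illustration, assuming the two helpers declared in
+// the corresponding header keep the signatures used below):
+//   unsigned space = SPIRVTypeConverter::getMemorySpaceForStorageClass(
+//       spirv::StorageClass::StorageBuffer); // yields 0 per the list below
+//   Optional<spirv::StorageClass> sc =
+//       SPIRVTypeConverter::getStorageClassForMemorySpace(0);
+//   // *sc == spirv::StorageClass::StorageBuffer; unmapped spaces yield None.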
+#define STORAGE_SPACE_MAP_LIST(MAP_FN) \ + MAP_FN(spirv::StorageClass::Generic, 1) \ + MAP_FN(spirv::StorageClass::StorageBuffer, 0) \ + MAP_FN(spirv::StorageClass::Workgroup, 3) \ + MAP_FN(spirv::StorageClass::Uniform, 4) \ + MAP_FN(spirv::StorageClass::Private, 5) \ + MAP_FN(spirv::StorageClass::Function, 6) \ + MAP_FN(spirv::StorageClass::PushConstant, 7) \ + MAP_FN(spirv::StorageClass::UniformConstant, 8) \ + MAP_FN(spirv::StorageClass::Input, 9) \ + MAP_FN(spirv::StorageClass::Output, 10) \ + MAP_FN(spirv::StorageClass::CrossWorkgroup, 11) \ + MAP_FN(spirv::StorageClass::AtomicCounter, 12) \ + MAP_FN(spirv::StorageClass::Image, 13) \ + MAP_FN(spirv::StorageClass::CallableDataNV, 14) \ + MAP_FN(spirv::StorageClass::IncomingCallableDataNV, 15) \ + MAP_FN(spirv::StorageClass::RayPayloadNV, 16) \ + MAP_FN(spirv::StorageClass::HitAttributeNV, 17) \ + MAP_FN(spirv::StorageClass::IncomingRayPayloadNV, 18) \ + MAP_FN(spirv::StorageClass::ShaderRecordBufferNV, 19) \ + MAP_FN(spirv::StorageClass::PhysicalStorageBuffer, 20) + +unsigned +SPIRVTypeConverter::getMemorySpaceForStorageClass(spirv::StorageClass storage) { +#define STORAGE_SPACE_MAP_FN(storage, space) \ + case storage: \ + return space; + + switch (storage) { STORAGE_SPACE_MAP_LIST(STORAGE_SPACE_MAP_FN) } +#undef STORAGE_SPACE_MAP_FN +} + +Optional +SPIRVTypeConverter::getStorageClassForMemorySpace(unsigned space) { +#define STORAGE_SPACE_MAP_FN(storage, space) \ + case space: \ + return storage; + + switch (space) { + STORAGE_SPACE_MAP_LIST(STORAGE_SPACE_MAP_FN) + default: + return llvm::None; + } +#undef STORAGE_SPACE_MAP_FN +} + +#undef STORAGE_SPACE_MAP_LIST + // TODO(ravishankarm): This is a utility function that should probably be // exposed by the SPIR-V dialect. Keeping it local till the use case arises. static Optional getTypeNumBytes(Type t) { - if (spirv::SPIRVDialect::isValidScalarType(t)) { + if (t.isa()) { auto bitWidth = t.getIntOrFloatBitWidth(); // According to the SPIR-V spec: // "There is no physical size or bit pattern defined for values with boolean @@ -101,68 +217,212 @@ return llvm::None; } -SPIRVTypeConverter::SPIRVTypeConverter() { - addConversion([](Type type) -> Optional { - // If the type is already valid in SPIR-V, directly return. - return spirv::SPIRVDialect::isValidType(type) ? type : Optional(); - }); +/// Converts a scalar `type` to a suitable type under the given `targetEnv`. +static Optional +convertScalarType(const spirv::TargetEnv &targetEnv, spirv::ScalarType type, + Optional storageClass = {}) { + // Get extension and capability requirements for the given type. + SmallVector, 1> extensions; + SmallVector, 2> capabilities; + type.getExtensions(extensions, storageClass); + type.getCapabilities(capabilities, storageClass); + + // If all requirements are met, then we can accept this type as-is. + if (succeeded(checkCapabilityRequirements(type, targetEnv, capabilities)) && + succeeded(checkExtensionRequirements(type, targetEnv, extensions))) + return type; + + // Otherwise we need to adjust the type, which really means adjusting the + // bitwidth given this is a scalar type. + // TODO(antiagainst): We are unconditionally converting the bitwidth here, + // this might be okay for non-interface types (i.e., types used in + // Private/Function storage classes), but not for interface types (i.e., + // types used in StorageBuffer/Uniform/PushConstant/etc. storage classes). + // This is because the latter actually affects the ABI contract with the + // runtime.
So we may want to expose a control on SPIRVTypeConverter to fail the + // conversion if we cannot change the bitwidth there. + + if (auto floatType = type.dyn_cast()) { + LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); + return Builder(targetEnv.getContext()).getF32Type(); + } + + auto intType = type.cast(); + LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); + return IntegerType::get(/*width=*/32, intType.getSignedness(), + targetEnv.getContext()); +} + +/// Converts a vector `type` to a suitable type under the given `targetEnv`. +static Optional +convertVectorType(const spirv::TargetEnv &targetEnv, VectorType type, + Optional storageClass = {}) { + if (!spirv::CompositeType::isValid(type)) { + // TODO(antiagainst): One-element vector types can be translated into scalar + // types. Vector types with more than four elements can be translated into + // array types. + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: 1- and > 4-element unimplemented\n"); + return llvm::None; + } + + // Get extension and capability requirements for the given type. + SmallVector, 1> extensions; + SmallVector, 2> capabilities; + type.cast().getExtensions(extensions, storageClass); + type.cast().getCapabilities(capabilities, storageClass); + + // If all requirements are met, then we can accept this type as-is. + if (succeeded(checkCapabilityRequirements(type, targetEnv, capabilities)) && + succeeded(checkExtensionRequirements(type, targetEnv, extensions))) + return type; + + auto elementType = convertScalarType( + targetEnv, type.getElementType().cast(), storageClass); + if (elementType) + return VectorType::get(type.getShape(), *elementType); + return llvm::None; +} + +/// Converts a tensor `type` to a suitable type under the given `targetEnv`. +/// +/// Note that this is mainly for lowering constant tensors. In SPIR-V one can +/// create composite constants with OpConstantComposite to embed relatively +/// large constant values and use OpCompositeExtract and OpCompositeInsert to +/// manipulate them, like what we do for vectors. +static Optional convertTensorType(const spirv::TargetEnv &targetEnv, + TensorType type) { + // TODO(ravishankarm) : Handle dynamic shapes. + if (!type.hasStaticShape()) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: dynamic shape unimplemented\n"); + return llvm::None; + } + + auto scalarType = type.getElementType().dyn_cast(); + if (!scalarType) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert non-scalar element type\n"); + return llvm::None; + } + + Optional scalarSize = getTypeNumBytes(scalarType); + Optional tensorSize = getTypeNumBytes(type); + if (!scalarSize || !tensorSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce element count\n"); + return llvm::None; + } + + auto arrayElemCount = *tensorSize / *scalarSize; + auto arrayElemType = convertScalarType(targetEnv, scalarType); + if (!arrayElemType) + return llvm::None; + Optional arrayElemSize = getTypeNumBytes(*arrayElemType); + if (!arrayElemSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce converted element size\n"); + return llvm::None; + } + + return spirv::ArrayType::get(*arrayElemType, arrayElemCount, *arrayElemSize); +} + +static Optional convertMemrefType(const spirv::TargetEnv &targetEnv, + MemRefType type) { + // TODO(ravishankarm) : Handle dynamic shapes.
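+  // For example (editor's sketch, not part of the original patch): a memref
+  // in the default memory space, say memref<4xf32>, converts below to roughly
+  //   !spv.ptr<!spv.struct<!spv.array<4 x f32, stride=4> [0]>, StorageBuffer>
+  // given that memory space 0 maps to the StorageBuffer storage class.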
+ if (!type.hasStaticShape()) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: dynamic shape unimplemented\n"); + return llvm::None; + } + + auto scalarType = type.getElementType().dyn_cast(); + if (!scalarType) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert non-scalar element type\n"); + return llvm::None; + } + + Optional scalarSize = getTypeNumBytes(scalarType); + Optional memrefSize = getTypeNumBytes(type); + if (!scalarSize || !memrefSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce element count\n"); + return llvm::None; + } + + auto arrayElemCount = *memrefSize / *scalarSize; + + auto storageClass = + SPIRVTypeConverter::getStorageClassForMemorySpace(type.getMemorySpace()); + if (!storageClass) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert memory space\n"); + return llvm::None; + } + + auto arrayElemType = convertScalarType(targetEnv, scalarType, storageClass); + if (!arrayElemType) + return llvm::None; + Optional arrayElemSize = getTypeNumBytes(*arrayElemType); + if (!arrayElemSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce converted element size\n"); + return llvm::None; + } + + auto arrayType = + spirv::ArrayType::get(*arrayElemType, arrayElemCount, *arrayElemSize); + + // Wrap in a struct to satisfy Vulkan interface requirements. + auto structType = spirv::StructType::get(arrayType, 0); + return spirv::PointerType::get(structType, *storageClass); +} + +SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr) + : targetEnv(targetAttr) { + // Add conversions. The order matters here: later ones will be tried earlier. + + // If all other cases fail, then we cannot convert this type. + addConversion([](Type type) { return llvm::None; }); + + // Allow all SPIR-V dialect specific types. This assumes all standard types + // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType) + // were tried before. + // + // TODO(antiagainst): this assumes that the SPIR-V types are valid to use in + // the given target environment, which should be the case if the whole + // pipeline is driven by the same target environment. Still, we probably + // want to validate and convert to be safe. + addConversion([](spirv::SPIRVType type) { return type; }); + addConversion([](IndexType indexType) { return SPIRVTypeConverter::getIndexType(indexType.getContext()); }); - addConversion([this](MemRefType memRefType) -> Type { - // TODO(ravishankarm): For now only support default memory space. The memory - // space description is not set is stone within MLIR, i.e. it depends on the - // context it is being used. To map this to SPIR-V storage classes, we - // should rely on the ABI attributes, and not on the memory space. This is - // still evolving, and needs to be revisited when there is more clarity. - if (memRefType.getMemorySpace()) - return Type(); - - auto elementType = convertType(memRefType.getElementType()); - if (!elementType) - return Type(); - - auto elementSize = getTypeNumBytes(elementType); - if (!elementSize) - return Type(); - - // TODO(ravishankarm) : Handle dynamic shapes. - if (memRefType.hasStaticShape()) { - auto arraySize = getTypeNumBytes(memRefType); - if (!arraySize) - return Type(); - - auto arrayType = spirv::ArrayType::get( - elementType, arraySize.getValue() / elementSize.getValue(), - elementSize.getValue()); - auto structType = spirv::StructType::get(arrayType, 0); - // For now initialize the storage class to StorageBuffer.
This will be - // updated later based on whats passed in w.r.t to the ABI attributes. - return spirv::PointerType::get(structType, - spirv::StorageClass::StorageBuffer); - } - return Type(); + + addConversion([this](IntegerType intType) -> Optional { + if (auto scalarType = intType.dyn_cast()) + return convertScalarType(targetEnv, scalarType); + return llvm::None; + }); + + addConversion([this](FloatType floatType) -> Optional { + if (auto scalarType = floatType.dyn_cast()) + return convertScalarType(targetEnv, scalarType); + return llvm::None; + }); + + addConversion([this](VectorType vectorType) { + return convertVectorType(targetEnv, vectorType); + }); + + addConversion([this](TensorType tensorType) { + return convertTensorType(targetEnv, tensorType); }); - addConversion([this](TensorType tensorType) -> Type { - // TODO(ravishankarm) : Handle dynamic shapes. - if (!tensorType.hasStaticShape()) - return Type(); - - auto elementType = convertType(tensorType.getElementType()); - if (!elementType) - return Type(); - - auto elementSize = getTypeNumBytes(elementType); - if (!elementSize) - return Type(); - - auto tensorSize = getTypeNumBytes(tensorType); - if (!tensorSize) - return Type(); - - return spirv::ArrayType::get(elementType, - tensorSize.getValue() / elementSize.getValue(), - elementSize.getValue()); + + addConversion([this](MemRefType memRefType) { + return convertMemrefType(targetEnv, memRefType); }); } @@ -360,11 +620,10 @@ mlir::spirv::setABIAttrs(spirv::FuncOp funcOp, //===----------------------------------------------------------------------===// std::unique_ptr -spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetEnv, - MLIRContext *context) { +spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetAttr) { std::unique_ptr target( // std::make_unique does not work here because the constructor is private. - new SPIRVConversionTarget(targetEnv, context)); + new SPIRVConversionTarget(targetAttr)); SPIRVConversionTarget *targetPtr = target.get(); target->addDynamicallyLegalDialect( Optional( @@ -375,30 +634,15 @@ spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetEnv, } spirv::SPIRVConversionTarget::SPIRVConversionTarget( - spirv::TargetEnvAttr targetEnv, MLIRContext *context) - : ConversionTarget(*context), givenVersion(targetEnv.getVersion()) { - for (spirv::Extension ext : targetEnv.getExtensions()) - givenExtensions.insert(ext); - - // Add extensions implied by the current version. - for (spirv::Extension ext : spirv::getImpliedExtensions(givenVersion)) - givenExtensions.insert(ext); - - for (spirv::Capability cap : targetEnv.getCapabilities()) { - givenCapabilities.insert(cap); - - // Add capabilities implied by the current capability. - for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) - givenCapabilities.insert(c); - } -} + spirv::TargetEnvAttr targetAttr) + : ConversionTarget(*targetAttr.getContext()), targetEnv(targetAttr) {} bool spirv::SPIRVConversionTarget::isLegalOp(Operation *op) { // Make sure this op is available at the given version. Ops not implementing // QueryMinVersionInterface/QueryMaxVersionInterface are available to all // SPIR-V versions. 
if (auto minVersion = dyn_cast(op)) - if (minVersion.getMinVersion() > givenVersion) { + if (minVersion.getMinVersion() > this->targetEnv.getVersion()) { LLVM_DEBUG(llvm::dbgs() << op->getName() << " illegal: requiring min version " << spirv::stringifyVersion(minVersion.getMinVersion()) @@ -406,7 +650,7 @@ bool spirv::SPIRVConversionTarget::isLegalOp(Operation *op) { return false; } if (auto maxVersion = dyn_cast(op)) - if (maxVersion.getMaxVersion() < givenVersion) { + if (maxVersion.getMaxVersion() < this->targetEnv.getVersion()) { LLVM_DEBUG(llvm::dbgs() << op->getName() << " illegal: requiring max version " << spirv::stringifyVersion(maxVersion.getMaxVersion()) @@ -414,38 +658,47 @@ return false; } - // Make sure this op's required extensions are allowed to use. For each op, - // we return a vector of vector for its extension requirements following - // ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) - // convention. Ops not implementing QueryExtensionInterface do not require - // extensions to be available. - if (auto extensions = dyn_cast(op)) { - auto exts = extensions.getExtensions(); - for (const auto &ors : exts) - if (llvm::all_of(ors, [this](spirv::Extension ext) { - return this->givenExtensions.count(ext) == 0; - })) { - LLVM_DEBUG(llvm::dbgs() << op->getName() - << " illegal: missing required extension\n"); - return false; - } - } + // Make sure this op's required extensions are allowed by the target + // environment. Ops not implementing QueryExtensionInterface do not require + // extensions to be available. + if (auto extensions = dyn_cast(op)) + if (failed(checkExtensionRequirements(op->getName(), this->targetEnv, + extensions.getExtensions()))) + return false; - // Make sure this op's required extensions are allowed to use. For each op, - // we return a vector of vector for its capability requirements following - // ((Capability::A OR Extension::B) AND (Capability::C OR Capability::D)) - // convention. Ops not implementing QueryExtensionInterface do not require - // extensions to be available. - if (auto capabilities = dyn_cast(op)) { - auto caps = capabilities.getCapabilities(); - for (const auto &ors : caps) - if (llvm::all_of(ors, [this](spirv::Capability cap) { - return this->givenCapabilities.count(cap) == 0; - })) { - LLVM_DEBUG(llvm::dbgs() << op->getName() - << " illegal: missing required capability\n"); - return false; - } + // Make sure this op's required capabilities are allowed by the target + // environment. Ops not implementing QueryCapabilityInterface do not require + // capabilities to be available. + if (auto capabilities = dyn_cast(op)) + if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, + capabilities.getCapabilities()))) + return false; + + SmallVector valueTypes; + valueTypes.append(op->operand_type_begin(), op->operand_type_end()); + valueTypes.append(op->result_type_begin(), op->result_type_end()); + + // Special treatment for global variables, whose type requirements are + // conveyed by type attributes. + if (auto globalVar = dyn_cast(op)) + valueTypes.push_back(globalVar.type()); + + // Make sure the op's operands/results use types that are allowed by the + // target environment.
+ SmallVector, 4> typeExtensions; + SmallVector, 8> typeCapabilities; + for (Type valueType : valueTypes) { + typeExtensions.clear(); + valueType.cast().getExtensions(typeExtensions); + if (failed(checkExtensionRequirements(op->getName(), this->targetEnv, + typeExtensions))) + return false; + + typeCapabilities.clear(); + valueType.cast().getCapabilities(typeCapabilities); + if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, + typeCapabilities))) + return false; } return true; diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 377242482b2a7d..f6b862156c49ef 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -1373,7 +1373,7 @@ static LogicalResult verify(spirv::ConstantOp constOp) { bool spirv::ConstantOp::isBuildableWith(Type type) { // Must be valid SPIR-V type first. - if (!SPIRVDialect::isValidType(type)) + if (!type.isa()) return false; if (type.getKind() >= Type::FIRST_SPIRV_TYPE && @@ -2460,7 +2460,7 @@ static LogicalResult verify(spirv::SpecConstantOp constOp) { case StandardAttributes::Integer: case StandardAttributes::Float: { // Make sure bitwidth is allowed. - if (!spirv::SPIRVDialect::isValidType(value.getType())) + if (!value.getType().isa()) return constOp.emitOpError("default value bitwidth disallowed"); return success(); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp index 92dc5b82bb8af8..3f963bd1d8a87b 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp @@ -163,13 +163,19 @@ bool CompositeType::classof(Type type) { case TypeKind::Array: case TypeKind::RuntimeArray: case TypeKind::Struct: - case StandardTypes::Vector: return true; + case StandardTypes::Vector: + return isValid(type.cast()); default: return false; } } +bool CompositeType::isValid(VectorType type) { + return type.getRank() == 1 && type.getElementType().isa() && + type.getNumElements() >= 2 && type.getNumElements() <= 4; +} + Type CompositeType::getElementType(unsigned index) const { switch (getKind()) { case spirv::TypeKind::Array: @@ -560,7 +566,30 @@ void RuntimeArrayType::getCapabilities( // ScalarType //===----------------------------------------------------------------------===// -bool ScalarType::classof(Type type) { return type.isIntOrFloat(); } +bool ScalarType::classof(Type type) { + if (auto floatType = type.dyn_cast()) { + return isValid(floatType); + } + if (auto intType = type.dyn_cast()) { + return isValid(intType); + } + return false; +} + +bool ScalarType::isValid(FloatType type) { return !type.isBF16(); } + +bool ScalarType::isValid(IntegerType type) { + switch (type.getWidth()) { + case 1: + case 8: + case 16: + case 32: + case 64: + return true; + default: + return false; + } +} void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, Optional storage) { @@ -678,9 +707,19 @@ void ScalarType::getCapabilities( //===----------------------------------------------------------------------===// bool SPIRVType::classof(Type type) { - return type.isa() || type.isa() || - (type.getKind() >= Type::FIRST_SPIRV_TYPE && - type.getKind() <= TypeKind::LAST_SPIRV_TYPE); + // Allow SPIR-V dialect types + if (type.getKind() >= Type::FIRST_SPIRV_TYPE && + type.getKind() <= TypeKind::LAST_SPIRV_TYPE) + return true; + if (type.isa()) + return true; + if (auto vectorType = type.dyn_cast()) + return CompositeType::isValid(vectorType); + return false; +} + +bool SPIRVType::isScalarOrVector() { + return 
isIntOrFloat() || isa(); } void SPIRVType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, diff --git a/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp b/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp index 88f3037ccc1e83..491fcf9a6f21bf 100644 --- a/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp +++ b/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp @@ -15,19 +15,85 @@ using namespace mlir; +//===----------------------------------------------------------------------===// +// TargetEnv +//===----------------------------------------------------------------------===// + +spirv::TargetEnv::TargetEnv(spirv::TargetEnvAttr targetAttr) + : targetAttr(targetAttr) { + for (spirv::Extension ext : targetAttr.getExtensions()) + givenExtensions.insert(ext); + + // Add extensions implied by the current version. + for (spirv::Extension ext : + spirv::getImpliedExtensions(targetAttr.getVersion())) + givenExtensions.insert(ext); + + for (spirv::Capability cap : targetAttr.getCapabilities()) { + givenCapabilities.insert(cap); + + // Add capabilities implied by the current capability. + for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) + givenCapabilities.insert(c); + } +} + +spirv::Version spirv::TargetEnv::getVersion() { + return targetAttr.getVersion(); +} + +bool spirv::TargetEnv::allows(spirv::Capability capability) const { + return givenCapabilities.count(capability); +} + +Optional +spirv::TargetEnv::allows(ArrayRef caps) const { + auto chosen = llvm::find_if(caps, [this](spirv::Capability cap) { + return givenCapabilities.count(cap); + }); + if (chosen != caps.end()) + return *chosen; + return llvm::None; +} + +bool spirv::TargetEnv::allows(spirv::Extension extension) const { + return givenExtensions.count(extension); +} + +Optional +spirv::TargetEnv::allows(ArrayRef exts) const { + auto chosen = llvm::find_if(exts, [this](spirv::Extension ext) { + return givenExtensions.count(ext); + }); + if (chosen != exts.end()) + return *chosen; + return llvm::None; +} + +MLIRContext *spirv::TargetEnv::getContext() const { + return targetAttr.getContext(); +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + StringRef spirv::getInterfaceVarABIAttrName() { return "spv.interface_var_abi"; } spirv::InterfaceVarABIAttr spirv::getInterfaceVarABIAttr(unsigned descriptorSet, unsigned binding, - spirv::StorageClass storageClass, + Optional storageClass, MLIRContext *context) { Type i32Type = IntegerType::get(32, context); + auto scAttr = + storageClass + ? IntegerAttr::get(i32Type, static_cast(*storageClass)) + : IntegerAttr(); return spirv::InterfaceVarABIAttr::get( IntegerAttr::get(i32Type, descriptorSet), - IntegerAttr::get(i32Type, binding), - IntegerAttr::get(i32Type, static_cast(storageClass)), context); + IntegerAttr::get(i32Type, binding), scAttr, context); } StringRef spirv::getEntryPointABIAttrName() { return "spv.entry_point_abi"; } diff --git a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp index 4dbc54ecfca2ec..1ca9cad977af08 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp @@ -21,33 +21,27 @@ using namespace mlir; -/// Checks if the `type` is a scalar or vector type. It is assumed that they are -/// valid for SPIR-V dialect already. 
-static bool isScalarOrVectorType(Type type) { - return spirv::SPIRVDialect::isValidScalarType(type) || type.isa(); -} - /// Creates a global variable for an argument based on the ABI info. static spirv::GlobalVariableOp -createGlobalVariableForArg(spirv::FuncOp funcOp, OpBuilder &builder, - unsigned argNum, - spirv::InterfaceVarABIAttr abiInfo) { +createGlobalVarForEntryPointArgument(OpBuilder &builder, spirv::FuncOp funcOp, + unsigned argIndex, + spirv::InterfaceVarABIAttr abiInfo) { auto spirvModule = funcOp.getParentOfType(); - if (!spirvModule) { + if (!spirvModule) return nullptr; - } + OpBuilder::InsertionGuard moduleInsertionGuard(builder); builder.setInsertionPoint(funcOp.getOperation()); std::string varName = - funcOp.getName().str() + "_arg_" + std::to_string(argNum); + funcOp.getName().str() + "_arg_" + std::to_string(argIndex); // Get the type of variable. If this is a scalar/vector type and has an ABI - // info create a variable of type !spv.ptr>. If not + // info create a variable of type !spv.ptr>. If not // it must already be a !spv.ptr>. - auto varType = funcOp.getType().getInput(argNum); - auto storageClass = - static_cast(abiInfo.storage_class().getInt()); - if (isScalarOrVectorType(varType)) { + auto varType = funcOp.getType().getInput(argIndex); + if (varType.cast().isScalarOrVector()) { + auto storageClass = + static_cast(abiInfo.storage_class().getInt()); varType = spirv::PointerType::get(spirv::StructType::get(varType), storageClass); } @@ -84,9 +78,18 @@ getInterfaceVariables(spirv::FuncOp funcOp, funcOp.walk([&](spirv::AddressOfOp addressOfOp) { auto var = module.lookupSymbol(addressOfOp.variable()); - if (var.type().cast().getStorageClass() != - spirv::StorageClass::StorageBuffer) { + // TODO(antiagainst): Per SPIR-V spec: "Before version 1.4, the interface’s + // storage classes are limited to the Input and Output storage classes. + // Starting with version 1.4, the interface’s storage classes are all + // storage classes used in declaring all global variables referenced by the + // entry point’s call tree." We should consider the target environment here. + switch (var.type().cast().getStorageClass()) { + case spirv::StorageClass::Input: + case spirv::StorageClass::Output: interfaceVarSet.insert(var.getOperation()); + break; + default: + break; } }); for (auto &var : interfaceVarSet) { @@ -173,11 +176,10 @@ LogicalResult ProcessInterfaceVarABI::matchAndRewrite( // produce an error. return failure(); } - auto var = - createGlobalVariableForArg(funcOp, rewriter, argType.index(), abiInfo); - if (!var) { + spirv::GlobalVariableOp var = createGlobalVarForEntryPointArgument( + rewriter, funcOp, argType.index(), abiInfo); + if (!var) return failure(); - } OpBuilder::InsertionGuard funcInsertionGuard(rewriter); rewriter.setInsertionPointToStart(&funcOp.front()); @@ -190,7 +192,7 @@ LogicalResult ProcessInterfaceVarABI::matchAndRewrite( // at the start of the function. It is probably better to do the load just // before the use. There might be multiple loads and currently there is no // easy way to replace all uses with a sequence of operations. 
-  if (isScalarOrVectorType(argType.value())) {
+  if (argType.value().cast<spirv::SPIRVType>().isScalarOrVector()) {
     auto indexType = SPIRVTypeConverter::getIndexType(funcOp.getContext());
     auto zero =
         spirv::ConstantOp::getZero(indexType, funcOp.getLoc(), &rewriter);
@@ -216,7 +218,9 @@ void LowerABIAttributesPass::runOnOperation() {
   spirv::ModuleOp module = getOperation();
   MLIRContext *context = &getContext();
 
-  SPIRVTypeConverter typeConverter;
+  spirv::TargetEnv targetEnv(spirv::lookupTargetEnv(module));
+
+  SPIRVTypeConverter typeConverter(targetEnv);
   OwningRewritePatternList patterns;
   patterns.insert<ProcessInterfaceVarABI>(context, typeConverter);
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp
index fff15c18574909..201adbbd38374f 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp
@@ -34,22 +34,18 @@ class UpdateVCEPass final
 
 } // namespace
 
 /// Checks that the `candidates` extension requirements can be satisfied
-/// with the given `allowedExtensions` and updates `deducedExtensions` if so.
-/// Emits errors attaching to the given `op` on failures.
+/// with the given `targetEnv` and updates `deducedExtensions` if so. Emits
+/// errors attached to the given `op` on failures.
 ///
 /// `candidates` is a vector of vectors of extension requirements, following the
 /// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D))
 /// convention.
 static LogicalResult checkAndUpdateExtensionRequirements(
-    Operation *op, const llvm::SmallSet<spirv::Extension, 4> &allowedExtensions,
+    Operation *op, const spirv::TargetEnv &targetEnv,
     const spirv::SPIRVType::ExtensionArrayRefVector &candidates,
     llvm::SetVector<spirv::Extension> &deducedExtensions) {
   for (const auto &ors : candidates) {
-    auto chosen = llvm::find_if(ors, [&](spirv::Extension ext) {
-      return allowedExtensions.count(ext);
-    });
-
-    if (chosen != ors.end()) {
+    if (Optional<spirv::Extension> chosen = targetEnv.allows(ors)) {
       deducedExtensions.insert(*chosen);
     } else {
       SmallVector<StringRef, 4> extStrings;
@@ -66,23 +62,18 @@ static LogicalResult checkAndUpdateExtensionRequirements(
 }
 
 /// Checks that the `candidates` capability requirements can be satisfied
-/// with the given `allowedCapabilities` and updates `deducedCapabilities` if
-/// so. Emits errors attaching to the given `op` on failures.
+/// with the given `targetEnv` and updates `deducedCapabilities` if so. Emits
+/// errors attached to the given `op` on failures.
 ///
 /// `candidates` is a vector of vectors of capability requirements, following the
 /// ((Capability::A OR Capability::B) AND (Capability::C OR Capability::D))
 /// convention.
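// An illustration of the requirement convention documented above, using
// hypothetical extension choices: each inner array is one OR-clause, and the
// outer vector conjoins the clauses.
static const spirv::Extension variablePtrClause[] = {
    spirv::Extension::SPV_KHR_variable_pointers};
static const spirv::Extension storageClause[] = {
    spirv::Extension::SPV_KHR_storage_buffer_storage_class,
    spirv::Extension::SPV_KHR_8bit_storage};
// Satisfiable iff SPV_KHR_variable_pointers is available AND at least one of
// the two storage-related extensions is available.
static const SmallVector<ArrayRef<spirv::Extension>, 2> exampleCandidates = {
    variablePtrClause, storageClause};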
static LogicalResult checkAndUpdateCapabilityRequirements( - Operation *op, - const llvm::SmallSet &allowedCapabilities, + Operation *op, const spirv::TargetEnv &targetEnv, const spirv::SPIRVType::CapabilityArrayRefVector &candidates, llvm::SetVector &deducedCapabilities) { for (const auto &ors : candidates) { - auto chosen = llvm::find_if(ors, [&](spirv::Capability cap) { - return allowedCapabilities.count(cap); - }); - - if (chosen != ors.end()) { + if (Optional chosen = targetEnv.allows(ors)) { deducedCapabilities.insert(*chosen); } else { SmallVector capStrings; @@ -101,32 +92,14 @@ static LogicalResult checkAndUpdateCapabilityRequirements( void UpdateVCEPass::runOnOperation() { spirv::ModuleOp module = getOperation(); - spirv::TargetEnvAttr targetEnv = spirv::lookupTargetEnv(module); - if (!targetEnv) { + spirv::TargetEnvAttr targetAttr = spirv::lookupTargetEnv(module); + if (!targetAttr) { module.emitError("missing 'spv.target_env' attribute"); return signalPassFailure(); } - spirv::Version allowedVersion = targetEnv.getVersion(); - - // Build a set for available extensions in the target environment. - llvm::SmallSet allowedExtensions; - for (spirv::Extension ext : targetEnv.getExtensions()) - allowedExtensions.insert(ext); - - // Add extensions implied by the current version. - for (spirv::Extension ext : spirv::getImpliedExtensions(allowedVersion)) - allowedExtensions.insert(ext); - - // Build a set for available capabilities in the target environment. - llvm::SmallSet allowedCapabilities; - for (spirv::Capability cap : targetEnv.getCapabilities()) { - allowedCapabilities.insert(cap); - - // Add capabilities implied by the current capability. - for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) - allowedCapabilities.insert(c); - } + spirv::TargetEnv targetEnv(targetAttr); + spirv::Version allowedVersion = targetAttr.getVersion(); spirv::Version deducedVersion = spirv::Version::V_1_0; llvm::SetVector deducedExtensions; @@ -148,15 +121,14 @@ void UpdateVCEPass::runOnOperation() { // Op extension requirements if (auto extensions = dyn_cast(op)) - if (failed(checkAndUpdateExtensionRequirements(op, allowedExtensions, - extensions.getExtensions(), - deducedExtensions))) + if (failed(checkAndUpdateExtensionRequirements( + op, targetEnv, extensions.getExtensions(), deducedExtensions))) return WalkResult::interrupt(); // Op capability requirements if (auto capabilities = dyn_cast(op)) if (failed(checkAndUpdateCapabilityRequirements( - op, allowedCapabilities, capabilities.getCapabilities(), + op, targetEnv, capabilities.getCapabilities(), deducedCapabilities))) return WalkResult::interrupt(); @@ -176,13 +148,13 @@ void UpdateVCEPass::runOnOperation() { typeExtensions.clear(); valueType.cast().getExtensions(typeExtensions); if (failed(checkAndUpdateExtensionRequirements( - op, allowedExtensions, typeExtensions, deducedExtensions))) + op, targetEnv, typeExtensions, deducedExtensions))) return WalkResult::interrupt(); typeCapabilities.clear(); valueType.cast().getCapabilities(typeCapabilities); if (failed(checkAndUpdateCapabilityRequirements( - op, allowedCapabilities, typeCapabilities, deducedCapabilities))) + op, targetEnv, typeCapabilities, deducedCapabilities))) return WalkResult::interrupt(); } diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp index 342ce37ad5157c..fba0f4af5f26ed 100644 --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -1483,6 +1483,10 @@ static void print(OpAsmPrinter 
&p, TypeCastOp op) { } static LogicalResult verify(TypeCastOp op) { + MemRefType canonicalType = canonicalizeStridedLayout(op.getMemRefType()); + if (!canonicalType.getAffineMaps().empty()) + return op.emitOpError("expects operand to be a memref with no layout"); + auto resultType = inferVectorTypeCastResultType(op.getMemRefType()); if (op.getResultMemRefType() != resultType) return op.emitOpError("expects result type to be: ") << resultType; diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index 82d2efb5255cf9..23536651f97496 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -109,6 +109,20 @@ DenseIntElementsAttr Builder::getI64VectorAttr(ArrayRef values) { values); } +DenseIntElementsAttr Builder::getI32TensorAttr(ArrayRef values) { + return DenseIntElementsAttr::get( + RankedTensorType::get(static_cast(values.size()), + getIntegerType(32)), + values); +} + +DenseIntElementsAttr Builder::getI64TensorAttr(ArrayRef values) { + return DenseIntElementsAttr::get( + RankedTensorType::get(static_cast(values.size()), + getIntegerType(64)), + values); +} + IntegerAttr Builder::getI32IntegerAttr(int32_t value) { return IntegerAttr::get(getIntegerType(32), APInt(32, value)); } diff --git a/mlir/lib/IR/Dialect.cpp b/mlir/lib/IR/Dialect.cpp index 4ce461e2d7d995..e48e7f64010da1 100644 --- a/mlir/lib/IR/Dialect.cpp +++ b/mlir/lib/IR/Dialect.cpp @@ -13,6 +13,7 @@ #include "mlir/IR/DialectInterface.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Regex.h" @@ -26,39 +27,40 @@ DialectAsmParser::~DialectAsmParser() {} // Dialect Registration //===----------------------------------------------------------------------===// -// Registry for all dialect allocation functions. -static llvm::ManagedStatic> +/// Registry for all dialect allocation functions. +static llvm::ManagedStatic< + llvm::MapVector> dialectRegistry; -// Registry for functions that set dialect hooks. -static llvm::ManagedStatic> +/// Registry for functions that set dialect hooks. +static llvm::ManagedStatic> dialectHooksRegistry; -/// Registers a specific dialect creation function with the system, typically -/// used through the DialectRegistration template. -void mlir::registerDialectAllocator(const DialectAllocatorFunction &function) { +void Dialect::registerDialectAllocator( + const ClassID *classId, const DialectAllocatorFunction &function) { assert(function && "Attempting to register an empty dialect initialize function"); - dialectRegistry->push_back(function); + dialectRegistry->insert({classId, function}); } /// Registers a function to set specific hooks for a specific dialect, typically /// used through the DialectHooksRegistration template. -void mlir::registerDialectHooksSetter(const DialectHooksSetter &function) { +void DialectHooks::registerDialectHooksSetter( + const ClassID *classId, const DialectHooksSetter &function) { assert( function && "Attempting to register an empty dialect hooks initialization function"); - dialectHooksRegistry->push_back(function); + dialectHooksRegistry->insert({classId, function}); } -/// Registers all dialects and their const folding hooks with the specified -/// MLIRContext. +/// Registers all dialects and hooks from the global registries with the +/// specified MLIRContext. 
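// A small usage sketch for the tensor-attribute helpers added in the
// Builders.cpp hunk above; the wrapper function is illustrative only.
static DenseIntElementsAttr makeShapeAttr(MLIRContext *context) {
  Builder b(context);
  // Yields a DenseIntElementsAttr of type tensor<3xi64>.
  return b.getI64TensorAttr({2, 3, 4});
}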
void mlir::registerAllDialects(MLIRContext *context) { - for (const auto &fn : *dialectRegistry) - fn(context); - for (const auto &fn : *dialectHooksRegistry) { - fn(context); + for (const auto &it : *dialectRegistry) + it.second(context); + for (const auto &it : *dialectHooksRegistry) { + it.second(context); } } diff --git a/mlir/lib/IR/StandardTypes.cpp b/mlir/lib/IR/StandardTypes.cpp index 488601cdb16b6b..1e7d9f38a2ceee 100644 --- a/mlir/lib/IR/StandardTypes.cpp +++ b/mlir/lib/IR/StandardTypes.cpp @@ -86,6 +86,8 @@ bool Type::isSignlessIntOrFloat() { bool Type::isIntOrFloat() { return isa() || isa(); } +bool Type::isIntOrIndexOrFloat() { return isIntOrFloat() || isIndex(); } + //===----------------------------------------------------------------------===// // Integer Type //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp index 86351bd689ad9b..8a71762e956dbb 100644 --- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp @@ -60,6 +60,8 @@ class Importer { GlobalOp processGlobal(llvm::GlobalVariable *GV); private: + /// Returns personality of `f` as a FlatSymbolRefAttr. + FlatSymbolRefAttr getPersonalityAsAttr(llvm::Function *f); /// Imports `bb` into `block`, which must be initially empty. LogicalResult processBasicBlock(llvm::BasicBlock *bb, Block *block); /// Imports `inst` and populates instMap[inst] with the imported Value. @@ -471,7 +473,7 @@ static const DenseMap opcMap = { // FIXME: switch // FIXME: indirectbr // FIXME: invoke - // FIXME: resume + INST(Resume, Resume), // FIXME: unreachable // FIXME: cleanupret // FIXME: catchret @@ -604,6 +606,7 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { case llvm::Instruction::Load: case llvm::Instruction::Store: case llvm::Instruction::Ret: + case llvm::Instruction::Resume: case llvm::Instruction::Trunc: case llvm::Instruction::ZExt: case llvm::Instruction::SExt: @@ -726,8 +729,11 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { for (unsigned i = 0, ie = lpi->getNumClauses(); i < ie; i++) ops.push_back(processConstant(lpi->getClause(i))); - b.create(loc, processType(lpi->getType()), lpi->isCleanup(), - ops); + Type ty = processType(lpi->getType()); + if (!ty) + return failure(); + + v = b.create(loc, ty, lpi->isCleanup(), ops); return success(); } case llvm::Instruction::Invoke: { @@ -798,6 +804,28 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { } } +FlatSymbolRefAttr Importer::getPersonalityAsAttr(llvm::Function *f) { + if (!f->hasPersonalityFn()) + return nullptr; + + llvm::Constant *pf = f->getPersonalityFn(); + + // If it directly has a name, we can use it. + if (pf->hasName()) + return b.getSymbolRefAttr(pf->getName()); + + // If it doesn't have a name, currently, only function pointers that are + // bitcast to i8* are parsed. 
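+  // For reference, the handled pattern corresponds to LLVM IR such as
+  //   personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+  // (illustrative), i.e. a function pointer cast to i8*, from which the
+  // callee's name can be recovered.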
+ if (auto ce = dyn_cast(pf)) { + if (ce->getOpcode() == llvm::Instruction::BitCast && + ce->getType() == llvm::Type::getInt8PtrTy(dialect->getLLVMContext())) { + if (auto func = dyn_cast(ce->getOperand(0))) + return b.getSymbolRefAttr(func->getName()); + } + } + return FlatSymbolRefAttr(); +} + LogicalResult Importer::processFunction(llvm::Function *f) { blocks.clear(); instMap.clear(); @@ -810,6 +838,13 @@ LogicalResult Importer::processFunction(llvm::Function *f) { b.setInsertionPoint(module.getBody(), getFuncInsertPt()); LLVMFuncOp fop = b.create(UnknownLoc::get(context), f->getName(), functionType); + + if (FlatSymbolRefAttr personality = getPersonalityAsAttr(f)) + fop.setAttr(b.getIdentifier("personality"), personality); + else if (f->hasPersonalityFn()) + emitWarning(UnknownLoc::get(context), + "could not deduce personality, skipping it"); + if (f->isDeclaration()) return success(); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 2c3a68fa108169..8bc76870e8f700 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -99,7 +99,8 @@ llvm::Constant *ModuleTranslation::getLLVMConstant(llvm::Type *llvmType, if (auto floatAttr = attr.dyn_cast()) return llvm::ConstantFP::get(llvmType, floatAttr.getValue()); if (auto funcAttr = attr.dyn_cast()) - return functionMapping.lookup(funcAttr.getValue()); + return llvm::ConstantExpr::getBitCast( + functionMapping.lookup(funcAttr.getValue()), llvmType); if (auto splatAttr = attr.dyn_cast()) { auto *sequentialType = cast(llvmType); auto elementType = sequentialType->getElementType(); @@ -353,6 +354,7 @@ LogicalResult ModuleTranslation::convertOperation(Operation &opInst, if (auto constOperand = dyn_cast(operand)) lpi->addClause(constOperand); } + valueMapping[lpOp.getResult()] = lpi; return success(); } @@ -470,7 +472,8 @@ LogicalResult ModuleTranslation::convertGlobals() { auto linkage = convertLinkageToLLVM(op.linkage()); bool anyExternalLinkage = - (linkage == llvm::GlobalVariable::ExternalLinkage || + ((linkage == llvm::GlobalVariable::ExternalLinkage && + isa(cst)) || linkage == llvm::GlobalVariable::ExternalWeakLinkage); auto addrSpace = op.addr_space().getLimitedValue(); auto *var = new llvm::GlobalVariable( @@ -584,6 +587,14 @@ LogicalResult ModuleTranslation::convertOneFunction(LLVMFuncOp func) { argIdx++; } + // Check the personality and set it. + if (func.personality().hasValue()) { + llvm::Type *ty = llvm::Type::getInt8PtrTy(llvmFunc->getContext()); + if (llvm::Constant *pfunc = + getLLVMConstant(ty, func.personalityAttr(), func.getLoc())) + llvmFunc->setPersonalityFn(pfunc); + } + // First, create all blocks so we can jump to them. 
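   // (Creating every block up front lets branch translation resolve forward
   // references to successors that have not been visited yet.)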
llvm::LLVMContext &llvmContext = llvmFunc->getContext(); for (auto &bb : func) { @@ -645,8 +656,10 @@ SmallVector ModuleTranslation::lookupValues(ValueRange values) { SmallVector remapped; remapped.reserve(values.size()); - for (Value v : values) + for (Value v : values) { + assert(valueMapping.count(v) && "referencing undefined value"); remapped.push_back(valueMapping.lookup(v)); + } return remapped; } diff --git a/mlir/lib/Transforms/Inliner.cpp b/mlir/lib/Transforms/Inliner.cpp index b6fcf8bc3941cc..ea48582dc52a93 100644 --- a/mlir/lib/Transforms/Inliner.cpp +++ b/mlir/lib/Transforms/Inliner.cpp @@ -14,8 +14,8 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/CallGraph.h" -#include "mlir/IR/Builders.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/SideEffects.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/InliningUtils.h" #include "mlir/Transforms/Passes.h" @@ -37,6 +37,259 @@ static llvm::cl::opt maxInliningIterations( llvm::cl::desc("Maximum number of iterations when inlining within an SCC"), llvm::cl::ReallyHidden, llvm::cl::init(4)); +//===----------------------------------------------------------------------===// +// Symbol Use Tracking +//===----------------------------------------------------------------------===// + +/// Returns true if this operation can be discarded if it is a symbol and has no +/// uses. 'allUsesVisible' corresponds to if the parent symbol table is hidden +/// from above. +static bool canDiscardSymbolOnUseEmpty(Operation *op, bool allUsesVisible) { + if (!SymbolTable::isSymbol(op)) + return false; + + // TODO: This is essentially the same logic from SymbolDCE. Remove this when + // we have a 'Symbol' interface. + // Private symbols are always initially considered dead. + SymbolTable::Visibility visibility = SymbolTable::getSymbolVisibility(op); + if (visibility == mlir::SymbolTable::Visibility::Private) + return true; + // We only include nested visibility here if all uses are visible. + if (allUsesVisible && visibility == SymbolTable::Visibility::Nested) + return true; + // Otherwise, public symbols are never removable. + return false; +} + +/// Walk all of the symbol table operations nested with 'op' along with a +/// boolean signifying if the symbols within can be treated as if all uses are +/// visible. The provided callback is invoked with the symbol table operation, +/// and a boolean signaling if all of the uses within the symbol table are +/// visible. +static void walkSymbolTables(Operation *op, bool allSymUsesVisible, + function_ref callback) { + if (op->hasTrait()) { + allSymUsesVisible = allSymUsesVisible || !SymbolTable::isSymbol(op) || + SymbolTable::getSymbolVisibility(op) == + SymbolTable::Visibility::Private; + callback(op, allSymUsesVisible); + } else { + // Otherwise if 'op' is not a symbol table, any nested symbols are + // guaranteed to be hidden. + allSymUsesVisible = true; + } + + for (Region ®ion : op->getRegions()) + for (Block &block : region) + for (Operation &nested : block) + walkSymbolTables(&nested, allSymUsesVisible, callback); +} + +/// Walk all of the used symbol callgraph nodes referenced with the given op. 
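+/// (Each symbol reference is resolved through the nearest symbol table and
+/// memoized in `resolvedRefs`, so repeated references to the same symbol are
+/// resolved only once.)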
+static void walkReferencedSymbolNodes( + Operation *op, CallGraph &cg, + DenseMap &resolvedRefs, + function_ref callback) { + auto symbolUses = SymbolTable::getSymbolUses(op); + assert(symbolUses && "expected uses to be valid"); + + Operation *symbolTableOp = op->getParentOp(); + for (const SymbolTable::SymbolUse &use : *symbolUses) { + auto refIt = resolvedRefs.insert({use.getSymbolRef(), nullptr}); + CallGraphNode *&node = refIt.first->second; + + // If this is the first instance of this reference, try to resolve a + // callgraph node for it. + if (refIt.second) { + auto *symbolOp = SymbolTable::lookupNearestSymbolFrom(symbolTableOp, + use.getSymbolRef()); + auto callableOp = dyn_cast_or_null(symbolOp); + if (!callableOp) + continue; + node = cg.lookupNode(callableOp.getCallableRegion()); + } + if (node) + callback(node, use.getUser()); + } +} + +//===----------------------------------------------------------------------===// +// CGUseList + +namespace { +/// This struct tracks the uses of callgraph nodes that can be dropped when +/// use_empty. It directly tracks and manages a use-list for all of the +/// call-graph nodes. This is necessary because many callgraph nodes are +/// referenced by SymbolRefAttr, which has no mechanism akin to the SSA `Use` +/// class. +struct CGUseList { + /// This struct tracks the uses of callgraph nodes within a specific + /// operation. + struct CGUser { + /// Any nodes referenced in the top-level attribute list of this user. We + /// use a set here because the number of references does not matter. + DenseSet topLevelUses; + + /// Uses of nodes referenced by nested operations. + DenseMap innerUses; + }; + + CGUseList(Operation *op, CallGraph &cg); + + /// Drop uses of nodes referred to by the given call operation that resides + /// within 'userNode'. + void dropCallUses(CallGraphNode *userNode, Operation *callOp, CallGraph &cg); + + /// Remove the given node from the use list. + void eraseNode(CallGraphNode *node); + + /// Returns true if the given callgraph node has no uses and can be pruned. + bool isDead(CallGraphNode *node) const; + + /// Returns true if the given callgraph node has a single use and can be + /// discarded. + bool hasOneUseAndDiscardable(CallGraphNode *node) const; + + /// Recompute the uses held by the given callgraph node. + void recomputeUses(CallGraphNode *node, CallGraph &cg); + + /// Merge the uses of 'lhs' with the uses of the 'rhs' after inlining a copy + /// of 'lhs' into 'rhs'. + void mergeUsesAfterInlining(CallGraphNode *lhs, CallGraphNode *rhs); + +private: + /// Decrement the uses of discardable nodes referenced by the given user. + void decrementDiscardableUses(CGUser &uses); + + /// A mapping between a discardable callgraph node (that is a symbol) and the + /// number of uses for this node. + DenseMap discardableSymNodeUses; + /// A mapping between a callgraph node and the symbol callgraph nodes that it + /// uses. + DenseMap nodeUses; +}; +} // end anonymous namespace + +CGUseList::CGUseList(Operation *op, CallGraph &cg) { + /// A set of callgraph nodes that are always known to be live during inlining. + DenseMap alwaysLiveNodes; + + // Walk each of the symbol tables looking for discardable callgraph nodes. + auto walkFn = [&](Operation *symbolTableOp, bool allUsesVisible) { + for (Block &block : symbolTableOp->getRegion(0)) { + for (Operation &op : block) { + // If this is a callgraph operation, check to see if it is discardable. 
+ if (auto callable = dyn_cast(&op)) { + if (auto *node = cg.lookupNode(callable.getCallableRegion())) { + if (canDiscardSymbolOnUseEmpty(&op, allUsesVisible)) + discardableSymNodeUses.try_emplace(node, 0); + continue; + } + } + // Otherwise, check for any referenced nodes. These will be always-live. + walkReferencedSymbolNodes(&op, cg, alwaysLiveNodes, + [](CallGraphNode *, Operation *) {}); + } + } + }; + walkSymbolTables(op, /*allSymUsesVisible=*/!op->getBlock(), walkFn); + + // Drop the use information for any discardable nodes that are always live. + for (auto &it : alwaysLiveNodes) + discardableSymNodeUses.erase(it.second); + + // Compute the uses for each of the callable nodes in the graph. + for (CallGraphNode *node : cg) + recomputeUses(node, cg); +} + +void CGUseList::dropCallUses(CallGraphNode *userNode, Operation *callOp, + CallGraph &cg) { + auto &userRefs = nodeUses[userNode].innerUses; + auto walkFn = [&](CallGraphNode *node, Operation *user) { + auto parentIt = userRefs.find(node); + if (parentIt == userRefs.end()) + return; + --parentIt->second; + --discardableSymNodeUses[node]; + }; + DenseMap resolvedRefs; + walkReferencedSymbolNodes(callOp, cg, resolvedRefs, walkFn); +} + +void CGUseList::eraseNode(CallGraphNode *node) { + // Drop all child nodes. + for (auto &edge : *node) + if (edge.isChild()) + eraseNode(edge.getTarget()); + + // Drop the uses held by this node and erase it. + auto useIt = nodeUses.find(node); + assert(useIt != nodeUses.end() && "expected node to be valid"); + decrementDiscardableUses(useIt->getSecond()); + nodeUses.erase(useIt); + discardableSymNodeUses.erase(node); +} + +bool CGUseList::isDead(CallGraphNode *node) const { + // If the parent operation isn't a symbol, simply check normal SSA deadness. + Operation *nodeOp = node->getCallableRegion()->getParentOp(); + if (!SymbolTable::isSymbol(nodeOp)) + return MemoryEffectOpInterface::hasNoEffect(nodeOp) && nodeOp->use_empty(); + + // Otherwise, check the number of symbol uses. + auto symbolIt = discardableSymNodeUses.find(node); + return symbolIt != discardableSymNodeUses.end() && symbolIt->second == 0; +} + +bool CGUseList::hasOneUseAndDiscardable(CallGraphNode *node) const { + // If this isn't a symbol node, check for side-effects and SSA use count. + Operation *nodeOp = node->getCallableRegion()->getParentOp(); + if (!SymbolTable::isSymbol(nodeOp)) + return MemoryEffectOpInterface::hasNoEffect(nodeOp) && nodeOp->hasOneUse(); + + // Otherwise, check the number of symbol uses. + auto symbolIt = discardableSymNodeUses.find(node); + return symbolIt != discardableSymNodeUses.end() && symbolIt->second == 1; +} + +void CGUseList::recomputeUses(CallGraphNode *node, CallGraph &cg) { + Operation *parentOp = node->getCallableRegion()->getParentOp(); + CGUser &uses = nodeUses[node]; + decrementDiscardableUses(uses); + + // Collect the new discardable uses within this node. 
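+  // (Re-walking the parent op below refreshes both the per-node use sets and
+  // the global discardable-use counts, keeping them consistent after the
+  // node's body has changed.)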
+ uses = CGUser(); + DenseMap resolvedRefs; + auto walkFn = [&](CallGraphNode *refNode, Operation *user) { + auto discardSymIt = discardableSymNodeUses.find(refNode); + if (discardSymIt == discardableSymNodeUses.end()) + return; + + if (user != parentOp) + ++uses.innerUses[refNode]; + else if (!uses.topLevelUses.insert(refNode).second) + return; + ++discardSymIt->second; + }; + walkReferencedSymbolNodes(parentOp, cg, resolvedRefs, walkFn); +} + +void CGUseList::mergeUsesAfterInlining(CallGraphNode *lhs, CallGraphNode *rhs) { + auto &lhsUses = nodeUses[lhs], &rhsUses = nodeUses[rhs]; + for (auto &useIt : lhsUses.innerUses) { + rhsUses.innerUses[useIt.first] += useIt.second; + discardableSymNodeUses[useIt.first] += useIt.second; + } +} + +void CGUseList::decrementDiscardableUses(CGUser &uses) { + for (CallGraphNode *node : uses.topLevelUses) + --discardableSymNodeUses[node]; + for (auto &it : uses.innerUses) + discardableSymNodeUses[it.first] -= it.second; +} + //===----------------------------------------------------------------------===// // CallGraph traversal //===----------------------------------------------------------------------===// @@ -45,7 +298,7 @@ static llvm::cl::opt maxInliningIterations( /// traversal. static void runTransformOnCGSCCs( const CallGraph &cg, - function_ref)> sccTransformer) { + function_ref)> sccTransformer) { std::vector currentSCCVec; auto cgi = llvm::scc_begin(&cg); while (!cgi.isAtEnd()) { @@ -63,10 +316,11 @@ namespace { /// Region(CallGraphNode) that it is dispatching to, we need to resolve them /// explicitly. struct ResolvedCall { - ResolvedCall(CallOpInterface call, CallGraphNode *targetNode) - : call(call), targetNode(targetNode) {} + ResolvedCall(CallOpInterface call, CallGraphNode *sourceNode, + CallGraphNode *targetNode) + : call(call), sourceNode(sourceNode), targetNode(targetNode) {} CallOpInterface call; - CallGraphNode *targetNode; + CallGraphNode *sourceNode, *targetNode; }; } // end anonymous namespace @@ -74,17 +328,22 @@ struct ResolvedCall { /// `traverseNestedCGNodes` is true, this will also collect call operations /// inside of nested callgraph nodes. static void collectCallOps(iterator_range blocks, - CallGraph &cg, SmallVectorImpl &calls, + CallGraphNode *sourceNode, CallGraph &cg, + SmallVectorImpl &calls, bool traverseNestedCGNodes) { - SmallVector worklist; - auto addToWorklist = [&](iterator_range blocks) { + SmallVector, 8> worklist; + auto addToWorklist = [&](CallGraphNode *node, + iterator_range blocks) { for (Block &block : blocks) - worklist.push_back(&block); + worklist.emplace_back(&block, node); }; - addToWorklist(blocks); + addToWorklist(sourceNode, blocks); while (!worklist.empty()) { - for (Operation &op : *worklist.pop_back_val()) { + Block *block; + std::tie(block, sourceNode) = worklist.pop_back_val(); + + for (Operation &op : *block) { if (auto call = dyn_cast(op)) { // TODO(riverriddle) Support inlining nested call references. CallInterfaceCallable callable = call.getCallableForCallee(); @@ -93,18 +352,20 @@ static void collectCallOps(iterator_range blocks, continue; } - CallGraphNode *node = cg.resolveCallable(call); - if (!node->isExternal()) - calls.emplace_back(call, node); + CallGraphNode *targetNode = cg.resolveCallable(call); + if (!targetNode->isExternal()) + calls.emplace_back(call, sourceNode, targetNode); continue; } // If this is not a call, traverse the nested regions. If // `traverseNestedCGNodes` is false, then don't traverse nested call graph // regions. 
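   // (The worklist pairs each block with its enclosing callgraph node so that
   // calls found in nested regions are attributed to the correct source node.)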
- for (auto &nestedRegion : op.getRegions()) - if (traverseNestedCGNodes || !cg.lookupNode(&nestedRegion)) - addToWorklist(nestedRegion); + for (auto &nestedRegion : op.getRegions()) { + CallGraphNode *nestedNode = cg.lookupNode(&nestedRegion); + if (traverseNestedCGNodes || !nestedNode) + addToWorklist(nestedNode ? nestedNode : sourceNode, nestedRegion); + } } } } @@ -122,7 +383,16 @@ struct Inliner : public InlinerInterface { /// *before* inlined terminator operations have been processed. void processInlinedBlocks(iterator_range inlinedBlocks) final { - collectCallOps(inlinedBlocks, cg, calls, /*traverseNestedCGNodes=*/true); + // Find the closest callgraph node from the first block. + CallGraphNode *node; + Region *region = inlinedBlocks.begin()->getParent(); + while (!(node = cg.lookupNode(region))) { + region = region->getParentRegion(); + assert(region && "expected valid parent node"); + } + + collectCallOps(inlinedBlocks, node, cg, calls, + /*traverseNestedCGNodes=*/true); } /// The current set of call instructions to consider for inlining. @@ -150,24 +420,47 @@ static bool shouldInline(ResolvedCall &resolvedCall) { return true; } +/// Delete the given node and remove it from the current scc and the callgraph. +static void deleteNode(CallGraphNode *node, CGUseList &useList, CallGraph &cg, + MutableArrayRef currentSCC) { + // Erase the parent operation and remove it from the various lists. + node->getCallableRegion()->getParentOp()->erase(); + cg.eraseNode(node); + + // Replace this node in the currentSCC with the external node. + auto it = llvm::find(currentSCC, node); + if (it != currentSCC.end()) + *it = cg.getExternalNode(); +} + /// Attempt to inline calls within the given scc. This function returns /// success if any calls were inlined, failure otherwise. -static LogicalResult inlineCallsInSCC(Inliner &inliner, - ArrayRef currentSCC) { +static LogicalResult +inlineCallsInSCC(Inliner &inliner, CGUseList &useList, + MutableArrayRef currentSCC) { CallGraph &cg = inliner.cg; auto &calls = inliner.calls; // Collect all of the direct calls within the nodes of the current SCC. We // don't traverse nested callgraph nodes, because they are handled separately // likely within a different SCC. - for (auto *node : currentSCC) { - if (!node->isExternal()) - collectCallOps(*node->getCallableRegion(), cg, calls, + for (CallGraphNode *node : currentSCC) { + if (node->isExternal()) + continue; + + // If this node is dead, just delete it now. + if (useList.isDead(node)) + deleteNode(node, useList, cg, currentSCC); + else + collectCallOps(*node->getCallableRegion(), node, cg, calls, /*traverseNestedCGNodes=*/false); } if (calls.empty()) return failure(); + // A set of dead nodes to remove after inlining. + SmallVector deadNodes; + // Try to inline each of the call operations. Don't cache the end iterator // here as more calls may be added during inlining. bool inlinedAnyCalls = false; @@ -179,26 +472,44 @@ static LogicalResult inlineCallsInSCC(Inliner &inliner, }); if (!shouldInline(it)) continue; - CallOpInterface call = it.call; Region *targetRegion = it.targetNode->getCallableRegion(); + + // If this is the last call to the target node and the node is discardable, + // then inline it in-place and delete the node if successful. 
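+    // (Inlining in place moves the callee region instead of cloning it; this
+    // is safe only because the node has exactly one remaining use and its
+    // symbol is discardable.)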
+    bool inlineInPlace = useList.hasOneUseAndDiscardable(it.targetNode);
+
     LogicalResult inlineResult = inlineCall(
         inliner, call, cast<CallableOpInterface>(targetRegion->getParentOp()),
-        targetRegion);
+        targetRegion, /*shouldCloneInlinedRegion=*/!inlineInPlace);
     if (failed(inlineResult))
       continue;
+    inlinedAnyCalls = true;
+
+    // If the inlining was successful, merge the new uses into the source node.
+    useList.dropCallUses(it.sourceNode, call.getOperation(), cg);
+    useList.mergeUsesAfterInlining(it.targetNode, it.sourceNode);
 
-    // If the inlining was successful, then erase the call.
+    // Then erase the call.
     call.erase();
-    inlinedAnyCalls = true;
+
+    // If we inlined in place, mark the node for deletion.
+    if (inlineInPlace) {
+      useList.eraseNode(it.targetNode);
+      deadNodes.push_back(it.targetNode);
+    }
   }
+
+  for (CallGraphNode *node : deadNodes)
+    deleteNode(node, useList, cg, currentSCC);
   calls.clear();
   return success(inlinedAnyCalls);
 }
 
 /// Canonicalize the nodes within the given SCC with the given set of
 /// canonicalization patterns.
-static void canonicalizeSCC(CallGraph &cg, ArrayRef<CallGraphNode *> currentSCC,
+static void canonicalizeSCC(CallGraph &cg, CGUseList &useList,
+                            MutableArrayRef<CallGraphNode *> currentSCC,
                             MLIRContext *context,
                             const OwningRewritePatternList &canonPatterns) {
   // Collect the sets of nodes to canonicalize.
@@ -246,12 +557,17 @@ static void canonicalizeSCC(CallGraph &cg, ArrayRef<CallGraphNode *> currentSCC,
     // thread may be used in a different context.
     canonicalizationHandler.eraseOrderIDForThread();
   });
+
+  // Recompute the uses held by each of the nodes.
+  for (CallGraphNode *node : nodesToCanonicalize)
+    useList.recomputeUses(node, cg);
 }
 
 /// Attempt to inline calls within the given scc, and run canonicalizations
 /// with the given patterns, until a fixed point is reached. This allows for
 /// the inlining of newly devirtualized calls.
-static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
+static void inlineSCC(Inliner &inliner, CGUseList &useList,
+                      MutableArrayRef<CallGraphNode *> currentSCC,
                       MLIRContext *context,
                       const OwningRewritePatternList &canonPatterns) {
   // If we successfully inlined any calls, run some simplifications on the
@@ -259,12 +575,12 @@ static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
   // point, or a maximum iteration count. We canonicalize here as it may
   // devirtualize new calls, as well as give us a better cost model.
   unsigned iterationCount = 0;
-  while (succeeded(inlineCallsInSCC(inliner, currentSCC))) {
+  while (succeeded(inlineCallsInSCC(inliner, useList, currentSCC))) {
     // If we aren't allowing simplifications or the max iteration count was
     // reached, then bail out early.
     if (disableCanonicalization || ++iterationCount >= maxInliningIterations)
       break;
-    canonicalizeSCC(inliner.cg, currentSCC, context, canonPatterns);
+    canonicalizeSCC(inliner.cg, useList, currentSCC, context, canonPatterns);
   }
 }
 
@@ -272,8 +588,6 @@ static void inlineSCC(Inliner &inliner, ArrayRef<CallGraphNode *> currentSCC,
 // InlinerPass
 //===----------------------------------------------------------------------===//
 
-// TODO(riverriddle) This pass should currently only be used for basic testing
-// of inlining functionality.
 namespace {
 struct InlinerPass : public OperationPass<InlinerPass> {
   void runOnOperation() override {
@@ -297,8 +611,9 @@ struct InlinerPass : public OperationPass<InlinerPass> {
     // Run the inline transform in post-order over the SCCs in the callgraph.
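     // (Post-order means callees are visited before their callers, so callee
     // bodies are already fully inlined and simplified when each caller's cost
     // model considers them.)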
Inliner inliner(context, cg); - runTransformOnCGSCCs(cg, [&](ArrayRef scc) { - inlineSCC(inliner, scc, context, canonPatterns); + CGUseList useList(getOperation(), cg); + runTransformOnCGSCCs(cg, [&](MutableArrayRef scc) { + inlineSCC(inliner, useList, scc, context, canonPatterns); }); } }; diff --git a/mlir/test/Conversion/GPUToSPIRV/if.mlir b/mlir/test/Conversion/GPUToSPIRV/if.mlir index 1585c53116c5d3..8a8aa1c8881338 100644 --- a/mlir/test/Conversion/GPUToSPIRV/if.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/if.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @main(%arg0 : memref<10xf32>, %arg1 : i1) { %c0 = constant 1 : index "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = "kernel_simple_selection", kernel_module = @kernels} : (index, index, index, index, index, index, memref<10xf32>, i1) -> () diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir index 6588de87005755..05c9d90c498c23 100644 --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) { %c0 = constant 0 : index %c12 = constant 12 : index @@ -21,9 +27,9 @@ module attributes {gpu.container_module} { // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[WORKGROUPIDVAR:@.*]] built_in("WorkgroupId") : !spv.ptr, Input> // CHECK-LABEL: spv.func @load_store_kernel - // CHECK-SAME: [[ARG0:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: [[ARG1:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: [[ARG2:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} + // CHECK-SAME: [[ARG0:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32{{[}][}]}} + // CHECK-SAME: [[ARG1:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32{{[}][}]}} + // CHECK-SAME: [[ARG2:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, descriptor_set = 0 : i32{{[}][}]}} // CHECK-SAME: [[ARG3:%.*]]: i32 {spv.interface_var_abi = {binding = 3 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} // CHECK-SAME: [[ARG4:%.*]]: i32 {spv.interface_var_abi = {binding = 4 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} // CHECK-SAME: [[ARG5:%.*]]: i32 {spv.interface_var_abi = {binding = 5 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} diff --git 
a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir index 7044d5474d3c5b..8adc5e355f0875 100644 --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @loop(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>) { %c0 = constant 1 : index "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = "loop_kernel", kernel_module = @kernels} : (index, index, index, index, index, index, memref<10xf32>, memref<10xf32>) -> () diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir index d9b32a6e571b7c..3076cd04b9fe12 100644 --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -5,7 +5,7 @@ module attributes {gpu.container_module} { // CHECK: spv.module Logical GLSL450 { // CHECK-LABEL: spv.func @basic_module_structure // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: {{%.*}}: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} + // CHECK-SAME: {{%.*}}: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32{{[}][}]}} // CHECK-SAME: spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>} gpu.func @basic_module_structure(%arg0 : f32, %arg1 : memref<12xf32>) attributes {gpu.kernel, spv.entry_point_abi = {local_size = dense<[32, 4, 1]>: vector<3xi32>}} { diff --git a/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir index 341df27460a0b3..cb5873a1baf04d 100644 --- a/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir @@ -1,105 +1,142 @@ -// RUN: mlir-opt -convert-std-to-spirv %s -o - | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-std-to-spirv %s -o - | FileCheck %s //===----------------------------------------------------------------------===// -// std binary arithmetic ops +// std arithmetic ops //===----------------------------------------------------------------------===// -// CHECK-LABEL: @add_sub -func @add_sub(%arg0 : i32, %arg1 : i32) { - // CHECK: spv.IAdd - %0 = addi %arg0, %arg1 : i32 - // CHECK: spv.ISub - %1 = subi %arg0, %arg1 : i32 +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// Check integer operation conversions. 
+// CHECK-LABEL: @int32_scalar +func @int32_scalar(%lhs: i32, %rhs: i32) { + // CHECK: spv.IAdd %{{.*}}, %{{.*}}: i32 + %0 = addi %lhs, %rhs: i32 + // CHECK: spv.ISub %{{.*}}, %{{.*}}: i32 + %1 = subi %lhs, %rhs: i32 + // CHECK: spv.IMul %{{.*}}, %{{.*}}: i32 + %2 = muli %lhs, %rhs: i32 + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: i32 + %3 = divi_signed %lhs, %rhs: i32 + // CHECK: spv.SRem %{{.*}}, %{{.*}}: i32 + %4 = remi_signed %lhs, %rhs: i32 + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: i32 + %5 = divi_unsigned %lhs, %rhs: i32 + // CHECK: spv.UMod %{{.*}}, %{{.*}}: i32 + %6 = remi_unsigned %lhs, %rhs: i32 return } -// CHECK-LABEL: @fadd_scalar -func @fadd_scalar(%arg: f32) { - // CHECK: spv.FAdd - %0 = addf %arg, %arg : f32 +// Check float operation conversions. +// CHECK-LABEL: @float32_scalar +func @float32_scalar(%lhs: f32, %rhs: f32) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: f32 + %0 = addf %lhs, %rhs: f32 + // CHECK: spv.FSub %{{.*}}, %{{.*}}: f32 + %1 = subf %lhs, %rhs: f32 + // CHECK: spv.FMul %{{.*}}, %{{.*}}: f32 + %2 = mulf %lhs, %rhs: f32 + // CHECK: spv.FDiv %{{.*}}, %{{.*}}: f32 + %3 = divf %lhs, %rhs: f32 + // CHECK: spv.FRem %{{.*}}, %{{.*}}: f32 + %4 = remf %lhs, %rhs: f32 return } -// CHECK-LABEL: @fdiv_scalar -func @fdiv_scalar(%arg: f32) { - // CHECK: spv.FDiv - %0 = divf %arg, %arg : f32 +// Check int vector types. +// CHECK-LABEL: @int_vector234 +func @int_vector234(%arg0: vector<2xi8>, %arg1: vector<3xi16>, %arg2: vector<4xi64>) { + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: vector<2xi8> + %0 = divi_signed %arg0, %arg0: vector<2xi8> + // CHECK: spv.SRem %{{.*}}, %{{.*}}: vector<3xi16> + %1 = remi_signed %arg1, %arg1: vector<3xi16> + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: vector<4xi64> + %2 = divi_unsigned %arg2, %arg2: vector<4xi64> return } -// CHECK-LABEL: @fmul_scalar -func @fmul_scalar(%arg: f32) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : f32 +// Check float vector types. +// CHECK-LABEL: @float_vector234 +func @float_vector234(%arg0: vector<2xf16>, %arg1: vector<3xf64>) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: vector<2xf16> + %0 = addf %arg0, %arg0: vector<2xf16> + // CHECK: spv.FMul %{{.*}}, %{{.*}}: vector<3xf64> + %1 = mulf %arg1, %arg1: vector<3xf64> return } -// CHECK-LABEL: @fmul_vector2 -func @fmul_vector2(%arg: vector<2xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<2xf32> +// CHECK-LABEL: @unsupported_1elem_vector +func @unsupported_1elem_vector(%arg0: vector<1xi32>) { + // CHECK: addi + %0 = addi %arg0, %arg0: vector<1xi32> return } -// CHECK-LABEL: @fmul_vector3 -func @fmul_vector3(%arg: vector<3xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<3xf32> +// CHECK-LABEL: @unsupported_5elem_vector +func @unsupported_5elem_vector(%arg0: vector<5xi32>) { + // CHECK: subi + %1 = subi %arg0, %arg0: vector<5xi32> return } -// CHECK-LABEL: @fmul_vector4 -func @fmul_vector4(%arg: vector<4xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<4xf32> +// CHECK-LABEL: @unsupported_2x2elem_vector +func @unsupported_2x2elem_vector(%arg0: vector<2x2xi32>) { + // CHECK: muli + %2 = muli %arg0, %arg0: vector<2x2xi32> return } -// CHECK-LABEL: @fmul_vector5 -func @fmul_vector5(%arg: vector<5xf32>) { - // Vector length of only 2, 3, and 4 is valid for SPIR-V. - // CHECK: mulf - %0 = mulf %arg, %arg : vector<5xf32> +} // end module + +// ----- + +// Check that types are converted to 32-bit when no special capabilities. 
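+// (The #spv.vce tuple in the target environment below deliberately lists no
+// 8/16/64-bit capabilities, so the type converter must emulate those types
+// with 32-bit equivalents, as the CHECK lines verify.)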
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @int_vector234 +func @int_vector234(%arg0: vector<2xi8>, %arg1: vector<3xi16>, %arg2: vector<4xi64>) { + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: vector<2xi32> + %0 = divi_signed %arg0, %arg0: vector<2xi8> + // CHECK: spv.SRem %{{.*}}, %{{.*}}: vector<3xi32> + %1 = remi_signed %arg1, %arg1: vector<3xi16> + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: vector<4xi32> + %2 = divi_unsigned %arg2, %arg2: vector<4xi64> return } -// TODO(antiagainst): enable this once we support converting binary ops -// needing type conversion. -// XXXXX-LABEL: @fmul_tensor -//func @fmul_tensor(%arg: tensor<4xf32>) { - // For tensors mulf cannot be lowered directly to spv.FMul. - // XXXXX: mulf - //%0 = mulf %arg, %arg : tensor<4xf32> - //return -//} - -// CHECK-LABEL: @frem_scalar -func @frem_scalar(%arg: f32) { - // CHECK: spv.FRem - %0 = remf %arg, %arg : f32 +// CHECK-LABEL: @float_scalar +func @float_scalar(%arg0: f16, %arg1: f64) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: f32 + %0 = addf %arg0, %arg0: f16 + // CHECK: spv.FMul %{{.*}}, %{{.*}}: f32 + %1 = mulf %arg1, %arg1: f64 return } -// CHECK-LABEL: @fsub_scalar -func @fsub_scalar(%arg: f32) { - // CHECK: spv.FSub - %0 = subf %arg, %arg : f32 - return -} +} // end module -// CHECK-LABEL: @div_rem -func @div_rem(%arg0 : i32, %arg1 : i32) { - // CHECK: spv.SDiv - %0 = divi_signed %arg0, %arg1 : i32 - // CHECK: spv.SMod - %1 = remi_signed %arg0, %arg1 : i32 - return -} +// ----- //===----------------------------------------------------------------------===// // std bit ops //===----------------------------------------------------------------------===// +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: @bitwise_scalar func @bitwise_scalar(%arg0 : i32, %arg1 : i32) { // CHECK: spv.BitwiseAnd @@ -122,6 +159,24 @@ func @bitwise_vector(%arg0 : vector<4xi32>, %arg1 : vector<4xi32>) { return } +// CHECK-LABEL: @logical_scalar +func @logical_scalar(%arg0 : i1, %arg1 : i1) { + // CHECK: spv.LogicalAnd + %0 = and %arg0, %arg1 : i1 + // CHECK: spv.LogicalOr + %1 = or %arg0, %arg1 : i1 + return +} + +// CHECK-LABEL: @logical_vector +func @logical_vector(%arg0 : vector<4xi1>, %arg1 : vector<4xi1>) { + // CHECK: spv.LogicalAnd + %0 = and %arg0, %arg1 : vector<4xi1> + // CHECK: spv.LogicalOr + %1 = or %arg0, %arg1 : vector<4xi1> + return +} + // CHECK-LABEL: @shift_scalar func @shift_scalar(%arg0 : i32, %arg1 : i32) { // CHECK: spv.ShiftLeftLogical @@ -206,17 +261,28 @@ func @cmpi(%arg0 : i32, %arg1 : i32) { return } +} // end module + +// ----- + //===----------------------------------------------------------------------===// // std.constant //===----------------------------------------------------------------------===// +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: @constant func @constant() { // CHECK: spv.constant true %0 = constant true - // CHECK: spv.constant 42 : i64 - %1 = constant 42 - // CHECK: spv.constant {{[0-9]*\.[0-9]*e?-?[0-9]*}} : f32 + // CHECK: spv.constant 42 : i32 + %1 = constant 42 : i32 + // CHECK: spv.constant 5.000000e-01 : 
f32 %2 = constant 0.5 : f32 // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> %3 = constant dense<[2, 3]> : vector<2xi32> @@ -237,50 +303,234 @@ func @constant() { return } -//===----------------------------------------------------------------------===// -// std logical binary operations -//===----------------------------------------------------------------------===// +// CHECK-LABEL: @constant_16bit +func @constant_16bit() { + // CHECK: spv.constant 4 : i16 + %0 = constant 4 : i16 + // CHECK: spv.constant 5.000000e+00 : f16 + %1 = constant 5.0 : f16 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi16> + %2 = constant dense<[2, 3]> : vector<2xi16> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf16> : !spv.array<5 x f16 [2]> + %3 = constant dense<4.0> : tensor<5xf16> + return +} -// CHECK-LABEL: @logical_scalar -func @logical_scalar(%arg0 : i1, %arg1 : i1) { - // CHECK: spv.LogicalAnd - %0 = and %arg0, %arg1 : i1 - // CHECK: spv.LogicalOr - %1 = or %arg0, %arg1 : i1 +// CHECK-LABEL: @constant_64bit +func @constant_64bit() { + // CHECK: spv.constant 4 : i64 + %0 = constant 4 : i64 + // CHECK: spv.constant 5.000000e+00 : f64 + %1 = constant 5.0 : f64 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi64> + %2 = constant dense<[2, 3]> : vector<2xi64> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf64> : !spv.array<5 x f64 [8]> + %3 = constant dense<4.0> : tensor<5xf64> return } -// CHECK-LABEL: @logical_vector -func @logical_vector(%arg0 : vector<4xi1>, %arg1 : vector<4xi1>) { - // CHECK: spv.LogicalAnd - %0 = and %arg0, %arg1 : vector<4xi1> - // CHECK: spv.LogicalOr - %1 = or %arg0, %arg1 : vector<4xi1> +} // end module + +// ----- + +// Check that constants are converted to 32-bit when no special capability. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @constant_16bit +func @constant_16bit() { + // CHECK: spv.constant 4 : i32 + %0 = constant 4 : i16 + // CHECK: spv.constant 5.000000e+00 : f32 + %1 = constant 5.0 : f16 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> + %2 = constant dense<[2, 3]> : vector<2xi16> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf32> : !spv.array<5 x f32 [4]> + %3 = constant dense<4.0> : tensor<5xf16> + // CHECK: spv.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf32> : !spv.array<4 x f32 [4]> + %4 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf16> return } -//===----------------------------------------------------------------------===// -// std.fpext -//===----------------------------------------------------------------------===// +// CHECK-LABEL: @constant_64bit +func @constant_64bit() { + // CHECK: spv.constant 4 : i32 + %0 = constant 4 : i64 + // CHECK: spv.constant 5.000000e+00 : f32 + %1 = constant 5.0 : f64 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> + %2 = constant dense<[2, 3]> : vector<2xi64> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf32> : !spv.array<5 x f32 [4]> + %3 = constant dense<4.0> : tensor<5xf64> + // CHECK: spv.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf32> : !spv.array<4 x f32 [4]> + %4 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf16> + return +} -// CHECK-LABEL: @fpext -func @fpext(%arg0 : f32) { - // CHECK: spv.FConvert - %0 = std.fpext %arg0 : f32 to f64 +// CHECK-LABEL: @corner_cases +func @corner_cases() { + 
// CHECK: %{{.*}} = spv.constant -1 : i32 + %0 = constant 4294967295 : i64 // 2^32 - 1 + // CHECK: %{{.*}} = spv.constant 2147483647 : i32 + %1 = constant 2147483647 : i64 // 2^31 - 1 + // CHECK: %{{.*}} = spv.constant -2147483648 : i32 + %2 = constant 2147483648 : i64 // 2^31 + // CHECK: %{{.*}} = spv.constant -2147483648 : i32 + %3 = constant -2147483648 : i64 // -2^31 + + // CHECK: %{{.*}} = spv.constant -1 : i32 + %5 = constant -1 : i64 + // CHECK: %{{.*}} = spv.constant -2 : i32 + %6 = constant -2 : i64 + // CHECK: %{{.*}} = spv.constant -1 : i32 + %7 = constant -1 : index + // CHECK: %{{.*}} = spv.constant -2 : i32 + %8 = constant -2 : index + + + // CHECK: spv.constant false + %9 = constant 0 : i1 + // CHECK: spv.constant true + %10 = constant 1 : i1 + + return +} + +// CHECK-LABEL: @unsupported_cases +func @unsupported_cases() { + // CHECK: %{{.*}} = constant 4294967296 : i64 + %0 = constant 4294967296 : i64 // 2^32 + // CHECK: %{{.*}} = constant -2147483649 : i64 + %1 = constant -2147483649 : i64 // -2^31 - 1 + // CHECK: %{{.*}} = constant 1.0000000000000002 : f64 + %2 = constant 0x3FF0000000000001 : f64 // smallest number > 1 return } +} // end module + +// ----- + //===----------------------------------------------------------------------===// -// std.fptrunc +// std cast ops //===----------------------------------------------------------------------===// -// CHECK-LABEL: @fptrunc -func @fptrunc(%arg0 : f64) { - // CHECK: spv.FConvert - %0 = std.fptrunc %arg0 : f64 to f32 - return +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @fpext1 +func @fpext1(%arg0: f16) -> f64 { + // CHECK: spv.FConvert %{{.*}} : f16 to f64 + %0 = std.fpext %arg0 : f16 to f64 + return %0 : f64 +} + +// CHECK-LABEL: @fpext2 +func @fpext2(%arg0 : f32) -> f64 { + // CHECK: spv.FConvert %{{.*}} : f32 to f64 + %0 = std.fpext %arg0 : f32 to f64 + return %0 : f64 } +// CHECK-LABEL: @fptrunc1 +func @fptrunc1(%arg0 : f64) -> f16 { + // CHECK: spv.FConvert %{{.*}} : f64 to f16 + %0 = std.fptrunc %arg0 : f64 to f16 + return %0 : f16 +} + +// CHECK-LABEL: @fptrunc2 +func @fptrunc2(%arg0: f32) -> f16 { + // CHECK: spv.FConvert %{{.*}} : f32 to f16 + %0 = std.fptrunc %arg0 : f32 to f16 + return %0 : f16 +} + +// CHECK-LABEL: @sitofp1 +func @sitofp1(%arg0 : i32) -> f32 { + // CHECK: spv.ConvertSToF %{{.*}} : i32 to f32 + %0 = std.sitofp %arg0 : i32 to f32 + return %0 : f32 +} + +// CHECK-LABEL: @sitofp2 +func @sitofp2(%arg0 : i64) -> f64 { + // CHECK: spv.ConvertSToF %{{.*}} : i64 to f64 + %0 = std.sitofp %arg0 : i64 to f64 + return %0 : f64 +} + +} // end module + +// ----- + +// Checks that cast types will be adjusted when no special capabilities for +// non-32-bit scalar types. 
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: @fpext1
+// CHECK-SAME: %[[ARG:.*]]: f32
+func @fpext1(%arg0: f16) {
+  // CHECK-NEXT: "use"(%[[ARG]])
+  %0 = std.fpext %arg0 : f16 to f64
+  "use"(%0) : (f64) -> ()
+  return
+}
+
+// CHECK-LABEL: @fpext2
+// CHECK-SAME: %[[ARG:.*]]: f32
+func @fpext2(%arg0 : f32) {
+  // CHECK-NEXT: "use"(%[[ARG]])
+  %0 = std.fpext %arg0 : f32 to f64
+  "use"(%0) : (f64) -> ()
+  return
+}
+
+// CHECK-LABEL: @fptrunc1
+// CHECK-SAME: %[[ARG:.*]]: f32
+func @fptrunc1(%arg0 : f64) {
+  // CHECK-NEXT: "use"(%[[ARG]])
+  %0 = std.fptrunc %arg0 : f64 to f16
+  "use"(%0) : (f16) -> ()
+  return
+}
+
+// CHECK-LABEL: @fptrunc2
+// CHECK-SAME: %[[ARG:.*]]: f32
+func @fptrunc2(%arg0: f32) {
+  // CHECK-NEXT: "use"(%[[ARG]])
+  %0 = std.fptrunc %arg0 : f32 to f16
+  "use"(%0) : (f16) -> ()
+  return
+}
+
+// CHECK-LABEL: @sitofp
+func @sitofp(%arg0 : i64) {
+  // CHECK: spv.ConvertSToF %{{.*}} : i32 to f32
+  %0 = std.sitofp %arg0 : i64 to f64
+  "use"(%0) : (f64) -> ()
+  return
+}
+
+} // end module
+
+// -----
+
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
 //===----------------------------------------------------------------------===//
 // std.select
 //===----------------------------------------------------------------------===//
@@ -294,25 +544,9 @@ func @select(%arg0 : i32, %arg1 : i32) {
 }
 
 //===----------------------------------------------------------------------===//
-// std.sitofp
+// std load/store ops
 //===----------------------------------------------------------------------===//
 
-// CHECK-LABEL: @sitofp
-func @sitofp(%arg0 : i32) {
-  // CHECK: spv.ConvertSToF
-  %0 = std.sitofp %arg0 : i32 to f32
-  return
-}
-
-//===----------------------------------------------------------------------===//
-// memref type
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func @memref_type({{%.*}}: memref<3xi1>)
-func @memref_type(%arg0: memref<3xi1>) {
-  return
-}
-
 // CHECK-LABEL: @load_store_zero_rank_float
 // CHECK: [[ARG0:%.*]]: !spv.ptr<!spv.struct<!spv.array<1 x f32 [4]> [0]>, StorageBuffer>,
 // CHECK: [[ARG1:%.*]]: !spv.ptr<!spv.struct<!spv.array<1 x f32 [4]> [0]>, StorageBuffer>)
@@ -350,3 +584,5 @@ func @load_store_zero_rank_int(%arg0: memref<i32>, %arg1: memref<i32>) {
   store %0, %arg1[] : memref<i32>
   return
 }
+
+} // end module
diff --git a/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir
new file mode 100644
index 00000000000000..81911bd1a6332c
--- /dev/null
+++ b/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir
@@ -0,0 +1,597 @@
+// RUN: mlir-opt -split-input-file -convert-std-to-spirv %s -o - | FileCheck %s
+
+//===----------------------------------------------------------------------===//
+// Integer types
+//===----------------------------------------------------------------------===//
+
+// Check that non-32-bit integer types are converted to 32-bit types if the
+// corresponding capabilities are not available.
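+// Signless, signed, and unsigned variants are widened alike: the Int8,
+// Int16, and Int64 capabilities gate the 8-, 16-, and 64-bit widths
+// respectively, and a missing capability converts the type to the 32-bit
+// form of the same signedness.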
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @integer8
+// CHECK-SAME: i32
+// CHECK-SAME: si32
+// CHECK-SAME: ui32
+func @integer8(%arg0: i8, %arg1: si8, %arg2: ui8) { return }
+
+// CHECK-LABEL: spv.func @integer16
+// CHECK-SAME: i32
+// CHECK-SAME: si32
+// CHECK-SAME: ui32
+func @integer16(%arg0: i16, %arg1: si16, %arg2: ui16) { return }
+
+// CHECK-LABEL: spv.func @integer64
+// CHECK-SAME: i32
+// CHECK-SAME: si32
+// CHECK-SAME: ui32
+func @integer64(%arg0: i64, %arg1: si64, %arg2: ui64) { return }
+
+} // end module
+
+// -----
+
+// Check that non-32-bit integer types are kept untouched if the corresponding
+// capabilities are available.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Int8, Int16, Int64], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @integer8
+// CHECK-SAME: i8
+// CHECK-SAME: si8
+// CHECK-SAME: ui8
+func @integer8(%arg0: i8, %arg1: si8, %arg2: ui8) { return }
+
+// CHECK-LABEL: spv.func @integer16
+// CHECK-SAME: i16
+// CHECK-SAME: si16
+// CHECK-SAME: ui16
+func @integer16(%arg0: i16, %arg1: si16, %arg2: ui16) { return }
+
+// CHECK-LABEL: spv.func @integer64
+// CHECK-SAME: i64
+// CHECK-SAME: si64
+// CHECK-SAME: ui64
+func @integer64(%arg0: i64, %arg1: si64, %arg2: ui64) { return }
+
+} // end module
+
+// -----
+
+// Check that weird bitwidths are not supported.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-NOT: spv.func @integer4
+func @integer4(%arg0: i4) { return }
+
+// CHECK-NOT: spv.func @integer128
+func @integer128(%arg0: i128) { return }
+
+// CHECK-NOT: spv.func @integer42
+func @integer42(%arg0: i42) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Index type
+//===----------------------------------------------------------------------===//
+
+// The index type is always converted into i32.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @index_type
+// CHECK-SAME: %{{.*}}: i32
+func @index_type(%arg0: index) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Float types
+//===----------------------------------------------------------------------===//
+
+// Check that non-32-bit float types are converted to 32-bit types if the
+// corresponding capabilities are not available.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @float16
+// CHECK-SAME: f32
+func @float16(%arg0: f16) { return }
+
+// CHECK-LABEL: spv.func @float64
+// CHECK-SAME: f32
+func @float64(%arg0: f64) { return }
+
+} // end module
+
+// -----
+
+// Check that non-32-bit float types are kept untouched if the corresponding
+// capabilities are available.
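+// f16 requires the Float16 capability and f64 the Float64 capability; with
+// both declared in the target environment, the signatures below keep their
+// original bitwidths.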
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Float16, Float64], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @float16
+// CHECK-SAME: f16
+func @float16(%arg0: f16) { return }
+
+// CHECK-LABEL: spv.func @float64
+// CHECK-SAME: f64
+func @float64(%arg0: f64) { return }
+
+} // end module
+
+// -----
+
+// Check that bf16 is not supported.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-NOT: spv.func @bf16_type
+func @bf16_type(%arg0: bf16) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Vector types
+//===----------------------------------------------------------------------===//
+
+// Check that capabilities for scalar types affect vector types too: with no
+// special capabilities available, element types are turned into their 32-bit
+// counterparts.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_vector
+// CHECK-SAME: vector<2xi32>
+// CHECK-SAME: vector<3xsi32>
+// CHECK-SAME: vector<4xui32>
+func @int_vector(
+  %arg0: vector<2xi8>,
+  %arg1: vector<3xsi16>,
+  %arg2: vector<4xui64>
+) { return }
+
+// CHECK-LABEL: spv.func @float_vector
+// CHECK-SAME: vector<2xf32>
+// CHECK-SAME: vector<3xf32>
+func @float_vector(
+  %arg0: vector<2xf16>,
+  %arg1: vector<3xf64>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that capabilities for scalar types affect vector types too: with the
+// relevant capabilities available, vector types are kept untouched.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Int8, Int16, Int64, Float16, Float64], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_vector
+// CHECK-SAME: vector<2xi8>
+// CHECK-SAME: vector<3xsi16>
+// CHECK-SAME: vector<4xui64>
+func @int_vector(
+  %arg0: vector<2xi8>,
+  %arg1: vector<3xsi16>,
+  %arg2: vector<4xui64>
+) { return }
+
+// CHECK-LABEL: spv.func @float_vector
+// CHECK-SAME: vector<2xf16>
+// CHECK-SAME: vector<3xf64>
+func @float_vector(
+  %arg0: vector<2xf16>,
+  %arg1: vector<3xf64>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that 1- or > 4-element vectors are not supported.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-NOT: spv.func @one_element_vector
+func @one_element_vector(%arg0: vector<1xi32>) { return }
+
+// CHECK-NOT: spv.func @large_vector
+func @large_vector(%arg0: vector<1024xi32>) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// MemRef types
+//===----------------------------------------------------------------------===//
+
+// Check memory spaces.
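+// The memref memory space is mapped to a SPIR-V storage class as follows:
+// 0 -> StorageBuffer, 4 -> Uniform, 3 -> Workgroup, 7 -> PushConstant,
+// 5 -> Private, 6 -> Function, matching the CHECK-SAME lines below.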
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: func @memref_mem_space
+// CHECK-SAME: StorageBuffer
+// CHECK-SAME: Uniform
+// CHECK-SAME: Workgroup
+// CHECK-SAME: PushConstant
+// CHECK-SAME: Private
+// CHECK-SAME: Function
+func @memref_mem_space(
+  %arg0: memref<4xf32, 0>,
+  %arg1: memref<4xf32, 4>,
+  %arg2: memref<4xf32, 3>,
+  %arg3: memref<4xf32, 7>,
+  %arg4: memref<4xf32, 5>,
+  %arg5: memref<4xf32, 6>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that boolean memref is not supported at the moment.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: func @memref_type({{%.*}}: memref<3xi1>)
+func @memref_type(%arg0: memref<3xi1>) {
+  return
+}
+
+} // end module
+
+// -----
+
+// Check that using non-32-bit scalar types in interface storage classes
+// requires the corresponding capability and extension: such types are
+// converted to 32-bit when the requirement is not satisfied.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_8bit_StorageBuffer
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i32 [4]> [0]>, StorageBuffer>
+func @memref_8bit_StorageBuffer(%arg0: memref<16xi8, 0>) { return }
+
+// CHECK-LABEL: spv.func @memref_8bit_Uniform
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x si32 [4]> [0]>, Uniform>
+func @memref_8bit_Uniform(%arg0: memref<16xsi8, 4>) { return }
+
+// CHECK-LABEL: spv.func @memref_8bit_PushConstant
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x ui32 [4]> [0]>, PushConstant>
+func @memref_8bit_PushConstant(%arg0: memref<16xui8, 7>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_StorageBuffer
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i32 [4]> [0]>, StorageBuffer>
+func @memref_16bit_StorageBuffer(%arg0: memref<16xi16, 0>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_Uniform
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x si32 [4]> [0]>, Uniform>
+func @memref_16bit_Uniform(%arg0: memref<16xsi16, 4>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_PushConstant
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x ui32 [4]> [0]>, PushConstant>
+func @memref_16bit_PushConstant(%arg0: memref<16xui16, 7>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_Input
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f32 [4]> [0]>, Input>
+func @memref_16bit_Input(%arg3: memref<16xf16, 9>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_Output
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f32 [4]> [0]>, Output>
+func @memref_16bit_Output(%arg4: memref<16xf16, 10>) { return }
+
+} // end module
+
+// -----
+
+// Check that using non-32-bit scalar types in interface storage classes
+// requires the corresponding capability and extension: such types are kept
+// as-is when the capability and extension are available.
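+// The target environment below is assumed to declare StoragePushConstant8
+// and StoragePushConstant16 (from SPV_KHR_8bit_storage and
+// SPV_KHR_16bit_storage), so 8-/16-bit element types in PushConstant
+// storage survive unchanged.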
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [StoragePushConstant8, StoragePushConstant16],
+             [SPV_KHR_8bit_storage, SPV_KHR_16bit_storage]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_8bit_PushConstant
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i8 [1]> [0]>, PushConstant>
+func @memref_8bit_PushConstant(%arg0: memref<16xi8, 7>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_PushConstant
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i16 [2]> [0]>, PushConstant>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f16 [2]> [0]>, PushConstant>
+func @memref_16bit_PushConstant(
+  %arg0: memref<16xi16, 7>,
+  %arg1: memref<16xf16, 7>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that using non-32-bit scalar types in interface storage classes
+// requires the corresponding capability and extension: such types are kept
+// as-is when the capability and extension are available.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [StorageBuffer8BitAccess, StorageBuffer16BitAccess],
+             [SPV_KHR_8bit_storage, SPV_KHR_16bit_storage]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_8bit_StorageBuffer
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i8 [1]> [0]>, StorageBuffer>
+func @memref_8bit_StorageBuffer(%arg0: memref<16xi8, 0>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_StorageBuffer
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f16 [2]> [0]>, StorageBuffer>
+func @memref_16bit_StorageBuffer(
+  %arg0: memref<16xi16, 0>,
+  %arg1: memref<16xf16, 0>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that using non-32-bit scalar types in interface storage classes
+// requires the corresponding capability and extension: such types are kept
+// as-is when the capability and extension are available.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [UniformAndStorageBuffer8BitAccess, StorageUniform16],
+             [SPV_KHR_8bit_storage, SPV_KHR_16bit_storage]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_8bit_Uniform
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i8 [1]> [0]>, Uniform>
+func @memref_8bit_Uniform(%arg0: memref<16xi8, 4>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_Uniform
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i16 [2]> [0]>, Uniform>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f16 [2]> [0]>, Uniform>
+func @memref_16bit_Uniform(
+  %arg0: memref<16xi16, 4>,
+  %arg1: memref<16xf16, 4>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that using non-32-bit scalar types in interface storage classes
+// requires the corresponding capability and extension: such types are kept
+// as-is when the capability and extension are available.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [StorageInputOutput16], [SPV_KHR_16bit_storage]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_16bit_Input
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x f16 [2]> [0]>, Input>
+func @memref_16bit_Input(%arg3: memref<16xf16, 9>) { return }
+
+// CHECK-LABEL: spv.func @memref_16bit_Output
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<16 x i16 [2]> [0]>, Output>
+func @memref_16bit_Output(%arg4: memref<16xi16, 10>) { return }
+
+} // end module
+
+// -----
+
+// Check that memref offset and strides affect the array size.
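+// The element count is offset + max over dimensions of (stride * size):
+// e.g. memref<16x4xf32, offset: 8, strides: [4, 1]> needs 8 + 4 * 16 = 72
+// elements, and strides [16, 1] give 16 * 16 = 256, as checked below.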
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [StorageBuffer16BitAccess], [SPV_KHR_16bit_storage]>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_offset_strides
+func @memref_offset_strides(
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<64 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<72 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<256 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<64 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<88 x f32 [4]> [0]>, StorageBuffer>
+  %arg0: memref<16x4xf32, offset: 0, strides: [4, 1]>,  // tightly packed; row major
+  %arg1: memref<16x4xf32, offset: 8, strides: [4, 1]>,  // offset 8
+  %arg2: memref<16x4xf32, offset: 0, strides: [16, 1]>, // pad 12 after each row
+  %arg3: memref<16x4xf32, offset: 0, strides: [1, 16]>, // tightly packed; col major
+  %arg4: memref<16x4xf32, offset: 0, strides: [1, 22]>, // pad 4 after each col
+
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<64 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<72 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<256 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<64 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.ptr<!spv.struct<!spv.array<88 x f16 [2]> [0]>, StorageBuffer>
+  %arg5: memref<16x4xf16, offset: 0, strides: [4, 1]>,
+  %arg6: memref<16x4xf16, offset: 8, strides: [4, 1]>,
+  %arg7: memref<16x4xf16, offset: 0, strides: [16, 1]>,
+  %arg8: memref<16x4xf16, offset: 0, strides: [1, 16]>,
+  %arg9: memref<16x4xf16, offset: 0, strides: [1, 22]>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that dynamic shapes are not supported.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: func @unranked_memref
+// CHECK-SAME: memref<*xi32>
+func @unranked_memref(%arg0: memref<*xi32>) { return }
+
+// CHECK-LABEL: func @dynamic_dim_memref
+// CHECK-SAME: memref<8x?xi32>
+func @dynamic_dim_memref(%arg0: memref<8x?xi32>) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Tensor types
+//===----------------------------------------------------------------------===//
+
+// Check that tensor element types are kept untouched with proper capabilities.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [Int8, Int16, Int64, Float16, Float64], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_tensor_types
+// CHECK-SAME: !spv.array<32 x i64 [8]>
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+// CHECK-SAME: !spv.array<32 x i16 [2]>
+// CHECK-SAME: !spv.array<32 x i8 [1]>
+func @int_tensor_types(
+  %arg0: tensor<8x4xi64>,
+  %arg1: tensor<8x4xi32>,
+  %arg2: tensor<8x4xi16>,
+  %arg3: tensor<8x4xi8>
+) { return }
+
+// CHECK-LABEL: spv.func @float_tensor_types
+// CHECK-SAME: !spv.array<32 x f64 [8]>
+// CHECK-SAME: !spv.array<32 x f32 [4]>
+// CHECK-SAME: !spv.array<32 x f16 [2]>
+func @float_tensor_types(
+  %arg0: tensor<8x4xf64>,
+  %arg1: tensor<8x4xf32>,
+  %arg2: tensor<8x4xf16>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that tensor element types are changed to 32-bit without capabilities.
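+// The shape still only contributes the total element count (8 * 4 = 32);
+// the element type and its stride change from the 8-/16-/64-bit forms to
+// the 4-byte i32 and f32 equivalents.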
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_tensor_types
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+func @int_tensor_types(
+  %arg0: tensor<8x4xi64>,
+  %arg1: tensor<8x4xi32>,
+  %arg2: tensor<8x4xi16>,
+  %arg3: tensor<8x4xi8>
+) { return }
+
+// CHECK-LABEL: spv.func @float_tensor_types
+// CHECK-SAME: !spv.array<32 x f32 [4]>
+// CHECK-SAME: !spv.array<32 x f32 [4]>
+// CHECK-SAME: !spv.array<32 x f32 [4]>
+func @float_tensor_types(
+  %arg0: tensor<8x4xf64>,
+  %arg1: tensor<8x4xf32>,
+  %arg2: tensor<8x4xf16>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that dynamic shapes are not supported.
+module attributes {
+  spv.target_env = #spv.target_env<
+    #spv.vce<v1.0, [], []>,
+    {max_compute_workgroup_invocations = 128 : i32,
+     max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: func @unranked_tensor
+// CHECK-SAME: tensor<*xi32>
+func @unranked_tensor(%arg0: tensor<*xi32>) { return }
+
+// CHECK-LABEL: func @dynamic_dim_tensor
+// CHECK-SAME: tensor<8x?xi32>
+func @dynamic_dim_tensor(%arg0: tensor<8x?xi32>) { return }
+
+} // end module
diff --git a/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir
index c9d1195bc0562f..cc94c089dfb23a 100644
--- a/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir
+++ b/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir
@@ -4,6 +4,13 @@
 // the desired output. Adding all of patterns within a single pass does
 // not seem to work.
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + //===----------------------------------------------------------------------===// // std.subview //===----------------------------------------------------------------------===// @@ -51,3 +58,5 @@ func @fold_static_stride_subview_with_store(%arg0 : memref<12x32xf32>, %arg1 : i store %arg5, %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [64, 3]> return } + +} // end module diff --git a/mlir/test/Dialect/GPU/multiple-all-reduce.mlir b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir new file mode 100644 index 00000000000000..f1437dbb1adb27 --- /dev/null +++ b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir @@ -0,0 +1,25 @@ +// RUN: mlir-opt --gpu-kernel-outlining --convert-gpu-to-nvvm %s | FileCheck %s + +func @main() { + %data = alloc() : memref<2x6xf32> + %sum = alloc() : memref<2xf32> + %mul = alloc() : memref<2xf32> + %c1 = constant 1 : index + + // ADD + MUL + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) { + %val = load %data[%bx, %tx] : memref<2x6xf32> + %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + store %reduced0, %sum[%bx] : memref<2xf32> + %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + store %reduced1, %mul[%bx] : memref<2xf32> + gpu.terminator + } + +// CHECK: gpu.module @main_kernel { +// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm<"[32 x float]"> +// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm<"[32 x float]"> + + return +} diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 03de594083dde4..710328c1cfc890 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -515,7 +515,7 @@ func @cmpxchg_failure_acq_rel(%i32_ptr : !llvm<"i32*">, %i32 : !llvm.i32) { llvm.func @foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) -> !llvm.i32 -llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) { +llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(3 : i32) : !llvm.i32 %1 = llvm.mlir.constant(2 : i32) : !llvm.i32 %2 = llvm.invoke @foo(%1) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 @@ -532,7 +532,7 @@ llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) { llvm.func @foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) -> !llvm.i32 -llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 %1 = llvm.alloca %0 x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> // expected-note@+1 {{global addresses expected as operand to bitcast used in clauses for landingpad}} @@ -551,7 +551,7 @@ llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { llvm.func @foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) 
-> !llvm.i32 -llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 ^bb1: // pred: ^bb0 @@ -564,6 +564,37 @@ llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { // ----- +llvm.func @foo(!llvm.i32) -> !llvm.i32 +llvm.func @__gxx_personality_v0(...) -> !llvm.i32 + +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { + %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 +^bb1: // pred: ^bb0 + llvm.return %0 : !llvm.i32 +^bb2: // pred: ^bb0 + %2 = llvm.landingpad cleanup : !llvm<"{ i8*, i32 }"> + // expected-error@+1 {{'llvm.resume' op expects landingpad value as operand}} + llvm.resume %0 : !llvm.i32 +} + +// ----- + +llvm.func @foo(!llvm.i32) -> !llvm.i32 + +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { + %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 +^bb1: // pred: ^bb0 + llvm.return %0 : !llvm.i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{llvm.landingpad needs to be in a function with a personality}} + %2 = llvm.landingpad cleanup : !llvm<"{ i8*, i32 }"> + llvm.resume %2 : !llvm<"{ i8*, i32 }"> +} + +// ----- + func @invalid_ordering_in_fence() { // expected-error @+1 {{can be given only acquire, release, acq_rel, and seq_cst orderings}} llvm.fence syncscope("agent") monotonic diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 32fe4c496523ad..8e08d5004d69db 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -238,7 +238,7 @@ llvm.func @bar(!llvm<"i8*">, !llvm<"i8*">, !llvm<"i8*">) llvm.func @__gxx_personality_v0(...) 
-> !llvm.i32 // CHECK-LABEL: @invokeLandingpad -llvm.func @invokeLandingpad() -> !llvm.i32 { +llvm.func @invokeLandingpad() -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { // CHECK-NEXT: %[[a0:[0-9]+]] = llvm.mlir.constant(0 : i32) : !llvm.i32 // CHECK-NEXT: %{{[0-9]+}} = llvm.mlir.constant(3 : i32) : !llvm.i32 // CHECK-NEXT: %[[a2:[0-9]+]] = llvm.mlir.constant("\01") : !llvm<"[1 x i8]"> @@ -261,11 +261,11 @@ llvm.func @invokeLandingpad() -> !llvm.i32 { %9 = llvm.invoke @foo(%7) to ^bb2 unwind ^bb1 : (!llvm.i32) -> !llvm<"{ i32, double, i32 }"> // CHECK-NEXT: ^bb1: -// CHECK-NEXT: %{{[0-9]+}} = llvm.landingpad cleanup (catch %[[a3]] : !llvm<"i8**">) (catch %[[a6]] : !llvm<"i8*">) (filter %[[a2]] : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> -// CHECK-NEXT: llvm.br ^bb3 +// CHECK-NEXT: %[[lp:[0-9]+]] = llvm.landingpad cleanup (catch %[[a3]] : !llvm<"i8**">) (catch %[[a6]] : !llvm<"i8*">) (filter %[[a2]] : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> +// CHECK-NEXT: llvm.resume %[[lp]] : !llvm<"{ i8*, i32 }"> ^bb1: %10 = llvm.landingpad cleanup (catch %3 : !llvm<"i8**">) (catch %6 : !llvm<"i8*">) (filter %2 : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> - llvm.br ^bb3 + llvm.resume %10 : !llvm<"{ i8*, i32 }"> // CHECK-NEXT: ^bb2: // CHECK-NEXT: llvm.return %[[a7]] : !llvm.i32 diff --git a/mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir b/mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir similarity index 57% rename from mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir rename to mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir index edc66c41591cfe..3972def985bb98 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir @@ -1,18 +1,25 @@ // RUN: mlir-opt -spirv-lower-abi-attrs -verify-diagnostics %s -o - | FileCheck %s +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: spv.module spv.module Logical GLSL450 { // CHECK-DAG: spv.globalVariable [[VAR0:@.*]] bind(0, 0) : !spv.ptr, StorageBuffer> // CHECK-DAG: spv.globalVariable [[VAR1:@.*]] bind(0, 1) : !spv.ptr [0]>, StorageBuffer> // CHECK: spv.func [[FN:@.*]]() - spv.func @kernel(%arg0: f32 - {spv.interface_var_abi = {binding = 0 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, - %arg1: !spv.ptr>, StorageBuffer> - {spv.interface_var_abi = {binding = 1 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}) "None" + spv.func @kernel( + %arg0: f32 + {spv.interface_var_abi = {binding = 0 : i32, + descriptor_set = 0 : i32, + storage_class = 12 : i32}}, + %arg1: !spv.ptr>, StorageBuffer> + {spv.interface_var_abi = {binding = 1 : i32, + descriptor_set = 0 : i32}}) "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} { // CHECK: [[ARG1:%.*]] = spv._address_of [[VAR1]] // CHECK: [[ADDRESSARG0:%.*]] = spv._address_of [[VAR0]] @@ -24,4 +31,6 @@ spv.module Logical GLSL450 { } // CHECK: spv.EntryPoint "GLCompute" [[FN]] // CHECK: spv.ExecutionMode [[FN]] "LocalSize", 32, 1, 1 -} +} // end spv.module + +} // end module diff --git a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir index d8af9fa8260758..42ff3f55e1ead4 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir @@ -1,5 +1,12 @@ // RUN: mlir-opt 
-spirv-lower-abi-attrs -verify-diagnostics %s -o - | FileCheck %s +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: spv.module spv.module Logical GLSL450 { // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize") @@ -21,16 +28,13 @@ spv.module Logical GLSL450 { spv.func @load_store_kernel( %arg0: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 0 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg1: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg2: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg3: i32 {spv.interface_var_abi = {binding = 3 : i32, descriptor_set = 0 : i32, @@ -122,4 +126,6 @@ spv.module Logical GLSL450 { } // CHECK: spv.EntryPoint "GLCompute" [[FN]], [[WORKGROUPID]], [[LOCALINVOCATIONID]], [[NUMWORKGROUPS]], [[WORKGROUPSIZE]] // CHECK-NEXT: spv.ExecutionMode [[FN]] "LocalSize", 32, 1, 1 -} +} // end spv.module + +} // end module diff --git a/mlir/test/Dialect/SPIRV/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/target-and-abi.mlir index a28ca29e0ab9e0..2c380e8ff03964 100644 --- a/mlir/test/Dialect/SPIRV/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/target-and-abi.mlir @@ -14,7 +14,7 @@ func @unknown_attr_on_region(%arg: i32 {spv.something}) { // ----- -// expected-error @+1 {{found unsupported 'spv.something' attribute on region result}} +// expected-error @+1 {{cannot attach SPIR-V attributes to region result}} func @unknown_attr_on_region() -> (i32 {spv.something}) { %0 = constant 10.0 : f32 return %0: f32 @@ -51,14 +51,14 @@ func @spv_entry_point() attributes { // spv.interface_var_abi //===----------------------------------------------------------------------===// -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} +// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing two or three 32-bit integer attributes: 'descriptor_set', 'binding', and optional 'storage_class'}} func @interface_var( %arg0 : f32 {spv.interface_var_abi = 64} ) { return } // ----- -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} +// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing two or three 32-bit integer attributes: 'descriptor_set', 'binding', and optional 'storage_class'}} func @interface_var( %arg0 : f32 {spv.interface_var_abi = {binding = 0: i32}} ) { return } @@ -74,31 +74,12 @@ func @interface_var( // ----- -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} -func @interface_var() -> (f32 {spv.interface_var_abi = 64}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} - -// ----- - -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 
'binding', and 'storage_class'}} -func @interface_var() -> (f32 {spv.interface_var_abi = {binding = 0: i32}}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} - -// ----- - -// CHECK: {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32}} -func @interface_var() -> (f32 {spv.interface_var_abi = { - binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32}}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} +// expected-error @+1 {{'spv.interface_var_abi' attribute cannot specify storage class when attaching to a non-scalar value}} +func @interface_var( + %arg0 : memref<4xf32> {spv.interface_var_abi = {binding = 0 : i32, + descriptor_set = 0 : i32, + storage_class = 12 : i32}} +) { return } // ----- diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 91f6850779a984..d9093edb3765b1 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1046,3 +1046,10 @@ func @reduce_unsupported_rank(%arg0: vector<4x16xf32>) -> f32 { // expected-error@+1 {{'vector.reduction' op unsupported reduction rank: 2}} %0 = vector.reduction "add", %arg0 : vector<4x16xf32> into f32 } + +// ----- + +func @type_cast_layout(%arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>>) { + // expected-error@+1 {{expects operand to be a memref with no layout}} + %0 = vector.type_cast %arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>> to memref> +} diff --git a/mlir/test/Target/import.ll b/mlir/test/Target/import.ll index 0394309093f290..23fc219168888c 100644 --- a/mlir/test/Target/import.ll +++ b/mlir/test/Target/import.ll @@ -282,8 +282,7 @@ define i32 @invokeLandingpad() personality i8* bitcast (i32 (...)* @__gxx_person ; FIXME: Change filter to a constant array once they are handled. ; Currently, even though it parses this, LLVM module is broken filter [1 x i8] [i8 1] - ; CHECK: llvm.br ^bb3 - br label %5 + resume { i8*, i32 } %3 ; CHECK: ^bb2: ; CHECK: llvm.return %{{[0-9]+}} : !llvm.i32 diff --git a/mlir/test/Target/llvmir-intrinsics.mlir b/mlir/test/Target/llvmir-intrinsics.mlir index f0f17966e0c38f..7be5e5fd596e5e 100644 --- a/mlir/test/Target/llvmir-intrinsics.mlir +++ b/mlir/test/Target/llvmir-intrinsics.mlir @@ -152,6 +152,20 @@ llvm.func @matrix_intrinsics(%A: !llvm<"<64 x float>">, %B: !llvm<"<48 x float>" llvm.return } +// CHECK-LABEL: @masked_intrinsics +llvm.func @masked_intrinsics(%A: !llvm<"<7 x float>*">, %mask: !llvm<"<7 x i1>">) { + // CHECK: call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* %{{.*}}, i32 1, <7 x i1> %{{.*}}, <7 x float> undef) + %a = llvm.intr.masked.load %A, %mask { alignment = 1: i32} : + (!llvm<"<7 x float>*">, !llvm<"<7 x i1>">) -> !llvm<"<7 x float>"> + // CHECK: call <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>* %{{.*}}, i32 1, <7 x i1> %{{.*}}, <7 x float> %{{.*}}) + %b = llvm.intr.masked.load %A, %mask, %a { alignment = 1: i32} : + (!llvm<"<7 x float>*">, !llvm<"<7 x i1>">, !llvm<"<7 x float>">) -> !llvm<"<7 x float>"> + // CHECK: call void @llvm.masked.store.v7f32.p0v7f32(<7 x float> %{{.*}}, <7 x float>* %0, i32 {{.*}}, <7 x i1> %{{.*}}) + llvm.intr.masked.store %b, %A, %mask { alignment = 1: i32} : + !llvm<"<7 x float>">, !llvm<"<7 x i1>"> into !llvm<"<7 x float>*"> + llvm.return +} + // Check that intrinsics are declared with appropriate types. 
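// For instance, the masked load/store calls on <7 x float> above must give
// rise to declarations of @llvm.masked.load.v7f32.p0v7f32 and
// @llvm.masked.store.v7f32.p0v7f32 whose alignment operands are immarg, as
// the last two CHECK-DAG lines verify.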
// CHECK-DAG: declare float @llvm.fma.f32(float, float, float) // CHECK-DAG: declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #0 @@ -179,3 +193,5 @@ llvm.func @matrix_intrinsics(%A: !llvm<"<64 x float>">, %B: !llvm<"<48 x float>" // CHECK-DAG: declare <48 x float> @llvm.matrix.transpose.v48f32(<48 x float>, i32 immarg, i32 immarg) // CHECK-DAG: declare <48 x float> @llvm.matrix.columnwise.load.v48f32.p0f32(float*, i32, i32 immarg, i32 immarg) // CHECK-DAG: declare void @llvm.matrix.columnwise.store.v48f32.p0f32(<48 x float>, float* writeonly, i32, i32 immarg, i32 immarg) +// CHECK-DAG: declare <7 x float> @llvm.masked.load.v7f32.p0v7f32(<7 x float>*, i32 immarg, <7 x i1>, <7 x float>) +// CHECK-DAG: declare void @llvm.masked.store.v7f32.p0v7f32(<7 x float>, <7 x float>*, i32 immarg, <7 x i1>) diff --git a/mlir/test/Target/llvmir.mlir b/mlir/test/Target/llvmir.mlir index 43cc7d804daec2..59c43b82cca5fd 100644 --- a/mlir/test/Target/llvmir.mlir +++ b/mlir/test/Target/llvmir.mlir @@ -1137,7 +1137,7 @@ llvm.func @bar(!llvm<"i8*">) -> !llvm<"i8*"> llvm.func @__gxx_personality_v0(...) -> !llvm.i32 // CHECK-LABEL: @invokeLandingpad -llvm.func @invokeLandingpad() -> !llvm.i32 { +llvm.func @invokeLandingpad() -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { // CHECK: %[[a1:[0-9]+]] = alloca i8 %0 = llvm.mlir.constant(0 : i32) : !llvm.i32 %1 = llvm.mlir.constant("\01") : !llvm<"[1 x i8]"> diff --git a/mlir/test/Transforms/inlining-dce.mlir b/mlir/test/Transforms/inlining-dce.mlir new file mode 100644 index 00000000000000..d9c8bf983c195d --- /dev/null +++ b/mlir/test/Transforms/inlining-dce.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-opt %s -inline | FileCheck %s + +// This file tests the callgraph dead code elimination performed by the inliner. + +// Function is already dead. +// CHECK-NOT: func @dead_function +func @dead_function() attributes {sym_visibility = "private"} { + return +} + +// Function becomes dead after inlining. +// CHECK-NOT: func @dead_function_b +func @dead_function_b() attributes {sym_visibility = "private"} { + return +} + +// CHECK: func @live_function() +func @live_function() { + call @dead_function_b() : () -> () + return +} + +// Same as above, but a transitive example. + +// CHECK: func @live_function_b +func @live_function_b() { + return +} +// CHECK-NOT: func @dead_function_c +func @dead_function_c() attributes {sym_visibility = "private"} { + call @live_function_b() : () -> () + return +} +// CHECK-NOT: func @dead_function_d +func @dead_function_d() attributes {sym_visibility = "private"} { + call @dead_function_c() : () -> () + call @dead_function_c() : () -> () + return +} +// CHECK: func @live_function_c +func @live_function_c() { + call @dead_function_c() : () -> () + call @dead_function_d() : () -> () + return +} + +// Function is referenced by non-callable top-level user. 
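+// Only private functions are eligible for deletion; this one stays live
+// because the "live.user" op below references its symbol even though that
+// use is not a call.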
+// CHECK: func @live_function_d +func @live_function_d() attributes {sym_visibility = "private"} { + return +} + +"live.user"() {use = @live_function_d} : () -> () diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 38f87dd2302cc7..dd8330626551f3 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -23,6 +23,13 @@ func @remap_input_1_to_1(%arg0: i64) { "test.invalid"(%arg0) : (i64) -> () } +// CHECK-LABEL: func @remap_call_1_to_1(%arg0: f64) +func @remap_call_1_to_1(%arg0: i64) { + // CHECK-NEXT: call @remap_input_1_to_1(%arg0) : (f64) -> () + call @remap_input_1_to_1(%arg0) : (i64) -> () + return +} + // CHECK-LABEL: func @remap_input_1_to_N({{.*}}f16, {{.*}}f16) func @remap_input_1_to_N(%arg0: f32) -> f32 { // CHECK-NEXT: "test.return"{{.*}} : (f16, f16) -> () diff --git a/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp b/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp index a91800d68fc04b..ad77e7d05f42f9 100644 --- a/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp +++ b/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp @@ -130,7 +130,12 @@ void ConvertToTargetEnv::runOnFunction() { auto targetEnv = fn.getOperation() ->getAttr(spirv::getTargetEnvAttrName()) .cast(); - auto target = spirv::SPIRVConversionTarget::get(targetEnv, context); + if (!targetEnv) { + fn.emitError("missing 'spv.target_env' attribute"); + return signalPassFailure(); + } + + auto target = spirv::SPIRVConversionTarget::get(targetEnv); OwningRewritePatternList patterns; patterns.insert(&getContext(), converter); mlir::populateFuncOpTypeConversionPattern(patterns, &getContext(), converter); + mlir::populateCallOpTypeConversionPattern(patterns, &getContext(), + converter); // Define the conversion target used for the test. 
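  // With populateCallOpTypeConversionPattern registered above, call
  // operations have their operand and result types remapped along with the
  // converted callee signature (exercised by @remap_call_1_to_1 in
  // test-legalizer.mlir).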
ConversionTarget target(getContext()); diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index bc737a0a119fbf..61d1443869a94e 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -45,6 +45,7 @@ target_link_libraries(MLIRTestTransforms MLIRLoopOps MLIRGPU MLIRPass + MLIRStandardToStandard MLIRTestDialect MLIRTransformUtils MLIRVectorToLoops diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir new file mode 100644 index 00000000000000..2c8eced2d4bee4 --- /dev/null +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s + +func @main() { + %data = alloc() : memref<2x6xf32> + %sum = alloc() : memref<2xf32> + %mul = alloc() : memref<2xf32> + %cst0 = constant 0.0 : f32 + %cst1 = constant 1.0 : f32 + %cst2 = constant 2.0 : f32 + %cst4 = constant 4.0 : f32 + %cst8 = constant 8.0 : f32 + %cst16 = constant 16.0 : f32 + + %cst3 = constant 3.0 : f32 + %cst6 = constant 6.0 : f32 + %cst7 = constant 7.0 : f32 + %cst10 = constant 10.0 : f32 + %cst11 = constant 11.0 : f32 + + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c3 = constant 3 : index + %c4 = constant 4 : index + %c5 = constant 5 : index + %c6 = constant 6 : index + + store %cst0, %data[%c0, %c0] : memref<2x6xf32> + store %cst1, %data[%c0, %c1] : memref<2x6xf32> + store %cst2, %data[%c0, %c2] : memref<2x6xf32> + store %cst4, %data[%c0, %c3] : memref<2x6xf32> + store %cst8, %data[%c0, %c4] : memref<2x6xf32> + store %cst16, %data[%c0, %c5] : memref<2x6xf32> + + store %cst2, %data[%c1, %c0] : memref<2x6xf32> + store %cst3, %data[%c1, %c1] : memref<2x6xf32> + store %cst6, %data[%c1, %c2] : memref<2x6xf32> + store %cst7, %data[%c1, %c3] : memref<2x6xf32> + store %cst10, %data[%c1, %c4] : memref<2x6xf32> + store %cst11, %data[%c1, %c5] : memref<2x6xf32> + + // ADD + MUL + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { + %val = load %data[%bx, %tx] : memref<2x6xf32> + %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + store %reduced0, %sum[%bx] : memref<2xf32> + %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + store %reduced1, %mul[%bx] : memref<2xf32> + gpu.terminator + } + + %ptr_sum = memref_cast %sum : memref<2xf32> to memref<*xf32> + call @print_memref_f32(%ptr_sum) : (memref<*xf32>) -> () + // CHECK: [31, 39] + + %ptr_mul = memref_cast %mul : memref<2xf32> to memref<*xf32> + call @print_memref_f32(%ptr_mul) : (memref<*xf32>) -> () + // CHECK: [0, 27720] + + return +} + +func @print_memref_f32(memref<*xf32>)
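+// print_memref_f32 is an external declaration; it is resolved at runtime
+// from the mlir_runner_utils shared library passed via --shared-libs in
+// the RUN line above.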